├── README.md
├── weibo
│   ├── README.md
│   ├── src
│   │   └── main
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── wycm
│   │                       ├── WeiboReplyCrawler.java
│   │                       ├── WeiboCrawler.java
│   │                       └── HttpClientUtil.java
│   └── pom.xml
├── www.zhihu.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── ZhihuCrawler.java
├── music163
│   ├── .DS_Store
│   ├── .gitignore
│   ├── src
│   │   ├── test
│   │   │   └── java
│   │   │       └── com
│   │   │           └── github
│   │   │               └── wycm
│   │   │                   └── AppTest.java
│   │   └── main
│   │       ├── resources
│   │       │   ├── log4j-dev.properties
│   │       │   ├── log4j-prod.properties
│   │       │   └── log4j.properties
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── wycm
│   │                       └── Music163.java
│   ├── server-auto-deploy.sh
│   └── pom.xml
├── www.douban.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── DoubanCrawler.java
├── .gitignore
├── www.dianping.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       ├── test
│       │   ├── java
│       │   │   └── com
│       │   │       └── github
│       │   │           └── wycm
│       │   │               └── DianpingCrawlerTest.java
│       │   └── resources
│       │       └── test.html
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── DianpingCrawler.java
└── www.goubanjia.com
    ├── pom.xml
    ├── src
    │   └── main
    │       └── java
    │           └── com
    │               └── github
    │                   └── wycm
    │                       └── GoubanjiaCrawler.java
    └── README.md
/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 | * A collection of crawlers for assorted websites, still being updated....
--------------------------------------------------------------------------------
/weibo/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 |
4 | ## An HttpClient demo that logs in to Weibo's mobile site and runs a keyword search
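5 |
6 | The crux of the flow, sketched from WeiboCrawler.java in this module (the credentials and the class name are placeholders):
7 | ```
8 | package com.github.wycm;
9 |
10 | import org.apache.http.client.methods.HttpPost;
11 |
12 | /**
13 |  * Minimal login-then-fetch sketch; see WeiboCrawler for the full flow.
14 |  */
15 | public class WeiboLoginSketch {
16 |     public static void main(String[] args) throws Exception {
17 |         HttpPost post = new HttpPost("https://passport.weibo.cn/sso/login");
18 |         // the login endpoint rejects requests that lack this Referer header
19 |         post.addHeader("Referer", "https://passport.weibo.cn/signin/login?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F");
20 |         HttpClientUtil.setHttpPostParams(post, WeiboCrawler.queryToMap("username=YOUR_PHONE&password=YOUR_PASSWORD&savestate=1&entry=mweibo"));
21 |         System.out.println(HttpClientUtil.getWebPage(post));
22 |     }
23 | }
24 | ```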
--------------------------------------------------------------------------------
/www.zhihu.com/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 | ## A simple demo that fetches the Zhihu front page using browser cookies
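4 |
5 | A minimal sketch of the idea (ZhihuCrawler.java in this module is the full demo and rebuilds a proper CookieStore instead of sending the raw header; the class name and cookie string below are placeholders):
6 | ```
7 | import org.apache.http.client.methods.HttpGet;
8 | import org.apache.http.impl.client.CloseableHttpClient;
9 | import org.apache.http.impl.client.HttpClients;
10 | import org.apache.http.util.EntityUtils;
11 |
12 | public class ZhihuCookieSketch {
13 |     public static void main(String[] args) throws Exception {
14 |         CloseableHttpClient client = HttpClients.createDefault();
15 |         HttpGet get = new HttpGet("https://www.zhihu.com");
16 |         // paste the Cookie header copied from a logged-in browser session
17 |         get.addHeader("Cookie", "cookie1=value1; cookie2=value2");
18 |         get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
19 |         System.out.println(EntityUtils.toString(client.execute(get).getEntity()));
20 |     }
21 | }
22 | ```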
--------------------------------------------------------------------------------
/music163/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wycm/crawler-set/HEAD/music163/.DS_Store
--------------------------------------------------------------------------------
/www.douban.com/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 |
4 | ## Crawling Douban's dynamic book-search page with Selenium
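5 |
6 | The core wait-then-parse pattern, sketched from DoubanCrawler.java (the chromedriver path and class name are placeholders):
7 | ```
8 | import org.jsoup.Jsoup;
9 | import org.openqa.selenium.By;
10 | import org.openqa.selenium.WebDriver;
11 | import org.openqa.selenium.chrome.ChromeDriver;
12 | import org.openqa.selenium.support.ui.WebDriverWait;
13 |
14 | public class DoubanSketch {
15 |     public static void main(String[] args) {
16 |         System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
17 |         WebDriver driver = new ChromeDriver();
18 |         driver.get("https://book.douban.com/subject_search?search_text=%E4%BA%92%E8%81%94%E7%BD%91&cat=1001");
19 |         // block until the dynamically rendered root node exists, then hand the DOM to Jsoup
20 |         new WebDriverWait(driver, 10).until(d -> d.findElement(By.cssSelector("div[id='root']")) != null);
21 |         Jsoup.parse(driver.getPageSource()).select("a[class=title-text]")
22 |                 .forEach(e -> System.out.println(e.text()));
23 |         driver.quit();
24 |     }
25 | }
26 | ```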
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # idea
2 | .idea/
3 | *.iml
4 |
5 | target/
6 | .classpath
7 | .project
8 | .settings/
9 |
--------------------------------------------------------------------------------
/music163/.gitignore:
--------------------------------------------------------------------------------
1 | # idea
2 | .idea/
3 | *.iml
4 |
5 | target/
6 | .classpath
7 | .project
8 | .settings/
9 |
10 | zhihucookies
11 | proxies
12 | zhiHuYZM.gif
--------------------------------------------------------------------------------
/www.dianping.com/README.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | * Analysis of, and a workaround for, Dianping's font-based anti-crawling
3 | * To be updated
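4 |
5 | A condensed sketch of the approach implemented in DianpingCrawler.java: the obfuscated glyphs on the page are empty tags whose css class maps, through background offsets, into an svg file that holds the real characters. The sketch below only extracts that class-to-offset mapping (the regex details and class name are illustrative):
6 | ```
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 |
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 |
13 | public class DianpingSketch {
14 |     public static void main(String[] args) throws Exception {
15 |         Document page = Jsoup.connect("http://www.dianping.com/shop/96231053")
16 |                 .userAgent("Mozilla/5.0").get();
17 |         // 1. the page links a css file whose rules tie each obfuscation class to an svg and a background offset
18 |         String cssUrl = "http:" + page.select("link[href*=svgtextcss]").first().attr("href");
19 |         String css = Jsoup.connect(cssUrl).ignoreContentType(true).execute().body();
20 |         // 2. rules look like .hy-xxxx{background:-84.0px -6.0px;}; the offsets index into the svg text
21 |         Matcher m = Pattern.compile("(\\.[a-z]{2}-\\w+)\\{background:(-?\\d+)\\.0px (-?\\d+)\\.0px").matcher(css);
22 |         while (m.find()) {
23 |             System.out.println(m.group(1) + " -> x=" + m.group(2) + " y=" + m.group(3));
24 |         }
25 |     }
26 | }
27 | ```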
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/WeiboReplyCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | /**
4 |  * Placeholder for a Weibo reply crawler; not implemented yet.
5 |  */
6 | public class WeiboReplyCrawler {
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/music163/src/test/java/com/github/wycm/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import static org.junit.Assert.assertTrue;
4 |
5 | import org.junit.Test;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | {
12 | /**
13 | * Rigorous Test :-)
14 | */
15 | @Test
16 | public void shouldAnswerWithTrue()
17 | {
18 | assertTrue( true );
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/music163/server-auto-deploy.sh:
--------------------------------------------------------------------------------
1 | # Kill any running crawler and Chrome processes
2 | sudo ps -ef |grep music163 | grep -v grep |awk '{print $2}'|xargs kill -9
3 | sudo ps -ef |grep chrome | grep -v grep |awk '{print $2}'|xargs kill -9
4 |
5 | # Pull the latest code and rebuild the fat jar with the prod profile
6 | cd /alidata/server/workspace/music163
7 | git pull origin master
8 | echo 'pull success'
9 | mvn clean
10 | mvn -Pprod package assembly:single
11 | echo 'mvn success'
12 |
13 | # Run Chrome against display :99 (e.g. Xvfb) and start the crawler in the background
14 | export DISPLAY=:99
15 | cd target
16 | java -jar music163-1.0-SNAPSHOT-jar-with-dependencies.jar &
17 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j-dev.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, stdout
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Threshold=INFO
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n
7 |
8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender
9 | log4j.appender.A1.Threshold=INFO
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n
12 | log4j.appender.A1.DatePattern='.'yyyyMMdd
13 | log4j.appender.A1.File=d:/log/all.log
14 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j-prod.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, A1
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Threshold=INFO
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n
7 |
8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender
9 | log4j.appender.A1.Threshold=INFO
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n
12 | log4j.appender.A1.DatePattern='.'yyyyMMdd
13 | log4j.appender.A1.File=/alidata/log/music163.log
14 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Every value below is a placeholder resolved at build time by Maven resource
2 | # filtering, using the filter file selected by the active profile
3 | # (log4j-dev.properties or log4j-prod.properties; see pom.xml).
4 | log4j.rootLogger=${log4j.rootLogger}
5 |
6 | log4j.appender.stdout=${log4j.appender.stdout}
7 | log4j.appender.stdout.Threshold=${log4j.appender.stdout.Threshold}
8 | log4j.appender.stdout.layout=${log4j.appender.stdout.layout}
9 | log4j.appender.stdout.layout.ConversionPattern=${log4j.appender.stdout.layout.ConversionPattern}
10 |
11 | log4j.appender.A1=${log4j.appender.A1}
12 | log4j.appender.A1.Threshold=${log4j.appender.A1.Threshold}
13 | log4j.appender.A1.layout=${log4j.appender.A1.layout}
14 | log4j.appender.A1.layout.ConversionPattern=${log4j.appender.A1.layout.ConversionPattern}
15 | log4j.appender.A1.DatePattern=${log4j.appender.A1.DatePattern}
16 | log4j.appender.A1.File=${log4j.appender.A1.File}
--------------------------------------------------------------------------------
/www.goubanjia.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>goubanjia</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>goubanjia-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |     </dependencies>
19 |
20 |     <build>
21 |         <plugins>
22 |             <plugin>
23 |                 <groupId>org.apache.maven.plugins</groupId>
24 |                 <artifactId>maven-compiler-plugin</artifactId>
25 |                 <version>2.0.2</version>
26 |                 <configuration>
27 |                     <source>1.7</source>
28 |                     <target>1.7</target>
29 |                 </configuration>
30 |             </plugin>
31 |         </plugins>
32 |     </build>
33 | </project>
--------------------------------------------------------------------------------
/www.zhihu.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>zhihu</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>zhihu-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |     </dependencies>
24 |
25 |     <build>
26 |         <plugins>
27 |             <plugin>
28 |                 <groupId>org.apache.maven.plugins</groupId>
29 |                 <artifactId>maven-compiler-plugin</artifactId>
30 |                 <version>2.0.2</version>
31 |                 <configuration>
32 |                     <source>1.7</source>
33 |                     <target>1.7</target>
34 |                 </configuration>
35 |             </plugin>
36 |         </plugins>
37 |     </build>
38 | </project>
--------------------------------------------------------------------------------
/www.douban.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>douban</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>selenium-geetest-crack</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.seleniumhq.selenium</groupId>
15 |             <artifactId>selenium-server</artifactId>
16 |             <version>3.0.1</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.jsoup</groupId>
20 |             <artifactId>jsoup</artifactId>
21 |             <version>1.7.2</version>
22 |         </dependency>
23 |     </dependencies>
24 |
25 |     <build>
26 |         <plugins>
27 |             <plugin>
28 |                 <groupId>org.apache.maven.plugins</groupId>
29 |                 <artifactId>maven-compiler-plugin</artifactId>
30 |                 <version>2.0.2</version>
31 |                 <configuration>
32 |                     <source>1.7</source>
33 |                     <target>1.7</target>
34 |                 </configuration>
35 |             </plugin>
36 |         </plugins>
37 |     </build>
38 | </project>
--------------------------------------------------------------------------------
/www.dianping.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>dianping</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>dianping-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>junit</groupId>
25 |             <artifactId>junit</artifactId>
26 |             <version>4.12</version>
27 |         </dependency>
28 |     </dependencies>
29 |
30 |     <build>
31 |         <plugins>
32 |             <plugin>
33 |                 <groupId>org.apache.maven.plugins</groupId>
34 |                 <artifactId>maven-compiler-plugin</artifactId>
35 |                 <version>2.0.2</version>
36 |                 <configuration>
37 |                     <source>1.8</source>
38 |                     <target>1.8</target>
39 |                 </configuration>
40 |             </plugin>
41 |         </plugins>
42 |     </build>
43 | </project>
--------------------------------------------------------------------------------
/www.dianping.com/src/test/java/com/github/wycm/DianpingCrawlerTest.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.methods.CloseableHttpResponse;
4 | import org.apache.http.client.methods.HttpGet;
5 | import org.apache.http.impl.client.HttpClients;
6 | import org.apache.http.util.EntityUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.select.Elements;
9 | import org.junit.Test;
10 |
11 | import java.io.BufferedReader;
12 | import java.io.IOException;
13 | import java.io.InputStreamReader;
14 | import java.util.stream.Collectors;
15 |
16 | import static org.junit.Assert.*;
17 |
18 | /**
19 | * Created by wycm on 2018/11/19.
20 | */
21 | public class DianpingCrawlerTest {
22 | @Test
23 | public void testJsoup(){
24 | String s = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/test.html"))).lines()
25 | .parallel().collect(Collectors.joining("\n"));
26 | Elements elements = Jsoup.parse(s).select("textPath[xlink:href='#1']");
27 | System.out.println(elements);
28 | }
29 | @Test
30 | public void testHttp() throws IOException {
31 | CloseableHttpResponse response = HttpClients.createDefault().execute(new HttpGet("http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/807789f715a7caed8e7c2475dcf94e20.svg"));
32 | System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
33 | }
34 | }
--------------------------------------------------------------------------------
/weibo/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>weibo</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>selenium-geetest-crack</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.7.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>log4j</groupId>
25 |             <artifactId>log4j</artifactId>
26 |             <version>1.2.15</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>com.jayway.jsonpath</groupId>
30 |             <artifactId>json-path</artifactId>
31 |             <version>2.2.0</version>
32 |         </dependency>
33 |     </dependencies>
34 |
35 |     <build>
36 |         <plugins>
37 |             <plugin>
38 |                 <groupId>org.apache.maven.plugins</groupId>
39 |                 <artifactId>maven-compiler-plugin</artifactId>
40 |                 <version>2.0.2</version>
41 |                 <configuration>
42 |                     <source>1.7</source>
43 |                     <target>1.7</target>
44 |                 </configuration>
45 |             </plugin>
46 |         </plugins>
47 |     </build>
48 | </project>
--------------------------------------------------------------------------------
/www.douban.com/src/main/java/com/github/wycm/DoubanCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.select.Elements;
7 | import org.openqa.selenium.By;
8 | import org.openqa.selenium.WebDriver;
9 | import org.openqa.selenium.WebElement;
10 | import org.openqa.selenium.chrome.ChromeDriver;
11 | import org.openqa.selenium.support.ui.ExpectedCondition;
12 | import org.openqa.selenium.support.ui.WebDriverWait;
13 |
14 | /**
15 | * Douban Selenium crawler
16 | * To run it, download chromedriver and update the chromedriver path in the code
17 | */
18 | public class DoubanCrawler {
19 | private static WebDriver driver;
20 | static {
21 | System.setProperty("webdriver.chrome.driver", "D:/dev/selenium/chromedriver_V2.30/chromedriver_win32/chromedriver.exe");
22 | driver = new ChromeDriver();
23 | }
24 | public static void main(String[] args) throws InterruptedException {
25 | douban();
26 | driver.quit();
27 | }
28 | private static void douban(){
29 | driver.get("https://book.douban.com/subject_search?search_text=%E4%BA%92%E8%81%94%E7%BD%91&cat=1001");
30 | By by = By.cssSelector("div[id='root']");
31 | waitForLoad(driver, by);
32 | String pageSource = driver.getPageSource();
33 | Document document = Jsoup.parse(pageSource);
34 | Elements elements = document.select("a[class=title-text]");
35 | for(Element element: elements){
36 | System.out.println(element.text());
37 | }
38 | }
39 | /**
40 | * Wait for the element to load; 10s timeout
41 | * @param driver
42 | * @param by
43 | */
44 | public static void waitForLoad(final WebDriver driver, final By by){
45 | new WebDriverWait(driver, 10).until(new ExpectedCondition<Boolean>() {
46 | public Boolean apply(WebDriver d) {
47 | WebElement element = driver.findElement(by);
48 | if (element != null){
49 | return true;
50 | }
51 | return false;
52 | }
53 | });
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/www.goubanjia.com/src/main/java/com/github/wycm/GoubanjiaCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.select.Elements;
7 |
8 | import java.io.IOException;
9 | import java.net.URL;
10 |
11 | public class GoubanjiaCrawler {
12 | public static void main(String[] args) throws IOException {
13 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
14 | setPort(document);
15 | //select all tr nodes under the table with class='table'
16 | Elements elements = document.select("table[class=table] tr");
17 | for (int i = 1; i < elements.size(); i++){
18 | //get the first td node
19 | Element td = elements.get(i).select("td").first();
20 | /**
21 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
22 | * This covers both variants:
23 | * style=display: none;
24 | * style=display:none;
25 | */
26 | for(Element none : td.select("[style*=none;]")){
27 | none.remove();
28 | }
29 | //strip spaces
30 | String ipPort = td.text().replaceAll(" ", "");
31 | //print
32 | System.out.println(ipPort);
33 | }
34 | }
35 |
36 | /**
37 | * Port restoration, ported from the site's js code
38 | * @param doc
39 | */
40 | private static void setPort(Document doc){
41 | for (Element e : doc.select(".port")){//$('.port').each(function() {
42 | String a = e.text();//var a = $(this).html();
43 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) {
44 | continue;//a plain return inside jQuery's each() skips the element, not the whole loop
45 | }
46 | String b = e.attr("class");//var b = $(this).attr('class');
47 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1];
48 | String[] c = b.split("");//var c = b.split("");
49 | int d = b.length();//var d = c.length;
50 | StringBuilder f = new StringBuilder();//var f = [];
51 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) {
52 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g]))
53 | }
54 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3)
55 | }
56 | }
57 | }
--------------------------------------------------------------------------------
/www.zhihu.com/src/main/java/com/github/wycm/ZhihuCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.CookieStore;
4 | import org.apache.http.client.methods.CloseableHttpResponse;
5 | import org.apache.http.client.methods.HttpGet;
6 | import org.apache.http.client.protocol.HttpClientContext;
7 | import org.apache.http.impl.client.BasicCookieStore;
8 | import org.apache.http.impl.client.CloseableHttpClient;
9 | import org.apache.http.impl.client.HttpClients;
10 | import org.apache.http.impl.client.LaxRedirectStrategy;
11 | import org.apache.http.impl.cookie.BasicClientCookie;
12 | import org.apache.http.util.EntityUtils;
13 | import org.jsoup.Jsoup;
14 | import org.jsoup.nodes.Document;
15 | import org.jsoup.nodes.Element;
16 | import org.jsoup.select.Elements;
17 |
18 | import java.io.IOException;
19 | import java.net.URL;
20 | import java.util.Calendar;
21 |
22 | public class ZhihuCrawler {
23 | private final static String RAW_COOKIES = "paste your browser's zhihu.com cookies here";
24 | private static final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36";
25 | private static final CloseableHttpClient httpClient = HttpClients.custom()
26 | .setUserAgent(userAgent)
27 | //make POST requests follow redirects by default
28 | .setRedirectStrategy(new LaxRedirectStrategy())
29 | .build();
30 | private static final HttpClientContext httpClientContext = HttpClientContext.create();
31 | private static CookieStore cookieStore = new BasicCookieStore();
32 |
33 |
34 | static {
35 | for (String rawCookie : RAW_COOKIES.split("; ")){
36 | String[] s = rawCookie.split("=");
37 | BasicClientCookie cookie = new BasicClientCookie(s[0], s[1]);
38 | cookie.setDomain("zhihu.com");
39 | cookie.setPath("/");
40 | cookie.setSecure(false);
41 | cookie.setAttribute("domain", "zhihu.com");
42 | Calendar calendar = Calendar.getInstance();
43 | calendar.add(Calendar.DAY_OF_MONTH, +5);
44 | cookie.setExpiryDate(calendar.getTime());
45 | cookieStore.addCookie(cookie);
46 | }
47 | httpClientContext.setCookieStore(cookieStore);
48 | }
49 | public static void main(String[] args) throws IOException {
50 | HttpGet httpGet = new HttpGet("https://www.zhihu.com");
51 | CloseableHttpResponse response = httpClient.execute(httpGet, httpClientContext);
52 | String s = EntityUtils.toString(response.getEntity());
53 | Document document = Jsoup.parse(s);
54 | Elements elements = document.select("div[class=ContentItem AnswerItem]");
55 | for (Element e : elements){
56 | // System.out.println(e);
57 | System.out.println(e.select("meta[itemprop=url]").first().attr("content"));
58 | System.out.println(e.select("meta[itemprop=name]").first().attr("content"));
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/WeiboCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import com.jayway.jsonpath.DocumentContext;
4 | import com.jayway.jsonpath.JsonPath;
5 | import org.apache.http.client.methods.HttpPost;
6 |
7 | import java.io.IOException;
8 | import java.net.URLEncoder;
9 | import java.util.HashMap;
10 | import java.util.Map;
11 | import java.util.regex.Matcher;
12 | import java.util.regex.Pattern;
13 |
14 | /**
15 | * Created by wangyang on 2017/8/22.
16 | */
17 | public class WeiboCrawler {
18 | private static final String CHECK_URL = "https://login.sina.com.cn/sso/prelogin.php?checkpin=1&entry=mweibo&su=MTMwODgyODA4NjA=&callback=jsonpcallback1503386116934";
19 | private static final String LOGIN_URL = "https://passport.weibo.cn/sso/login";
20 | private static final String POST_ARGS = "username=13268037201&password=password&savestate=1&r=http%3A%2F%2Fm.weibo.cn%2F&ec=0&pagerefer=https%3A%2F%2Fpassport.weibo.cn%2Fsignin%2Fwelcome%3Fentry%3Dmweibo%26r%3Dhttp%253A%252F%252Fm.weibo.cn%252F&entry=mweibo&wentry=&loginfrom=&client_id=&code=&qq=&mainpageflag=1&hff=&hfp=";
21 | private static final String KEYWORD_ARGS = "type=all&queryVal=${keyword}&luicode=10000011&lfid=106003type%3D1&title=${keyword}&containerid=100103type%3D1%26q%3D${keyword}";
22 | /**
23 | * Log in, then hit the search url
24 | */
25 | public static void main(String[] args) throws IOException {
26 | String searchUrl = "https://m.weibo.cn/api/container/getIndex";
27 | String keyword = "联想电脑";
28 | System.out.println(HttpClientUtil.get(CHECK_URL));
29 | HttpPost post = new HttpPost(LOGIN_URL);
30 | //this header is required
31 | post.addHeader("Referer", "https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F");
32 | HttpClientUtil.setHttpPostParams(post, queryToMap(POST_ARGS));
33 | String res = HttpClientUtil.getWebPage(post);
34 | System.out.println(res);
35 | System.out.println(HttpClientUtil.get("https://m.weibo.cn/"));
36 |
37 |
38 | String searchArgs = KEYWORD_ARGS.replaceAll("\\$\\{keyword\\}", URLEncoder.encode(keyword, "utf-8"));
39 | searchUrl = searchUrl + "?" + searchArgs;
40 | String searchRes = HttpClientUtil.get(searchUrl);
41 | Pattern pattern = Pattern.compile("idstr\":\"(\\d+)\"");
42 | Matcher matcher = pattern.matcher(searchRes);
43 | while (matcher.find()){
44 | String commentId = matcher.group(1);
45 | System.out.println(commentId);
46 | }
47 | // String result = HttpClientUtil.get("https://m.weibo.cn/api/comments/show?id=4154417035431509&page=1");
48 | // System.out.println(result);
49 | }
50 | /**
51 | * returns the url parameters in a map
52 | * @param query
53 | * @return map
54 | */
55 | public static Map<String, String> queryToMap(String query){
56 | if (query == null){
57 | query = "";
58 | }
59 | Map<String, String> result = new HashMap<>();
60 | for (String param : query.split("&")) {
61 | String[] pair = param.split("=");
62 | if (pair.length>1) {
63 | result.put(pair[0], pair[1]);
64 | }else{
65 | result.put(pair[0], "");
66 | }
67 | }
68 | return result;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/music163/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |
3 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.github.wycm</groupId>
8 |     <artifactId>music163</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 |
11 |     <name>music163</name>
12 |     <url>http://www.example.com</url>
13 |
14 |     <properties>
15 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |         <maven.compiler.source>1.8</maven.compiler.source>
17 |         <maven.compiler.target>1.8</maven.compiler.target>
18 |     </properties>
19 |
20 |     <dependencies>
21 |         <dependency>
22 |             <groupId>org.seleniumhq.selenium</groupId>
23 |             <artifactId>selenium-server</artifactId>
24 |             <version>3.0.1</version>
25 |         </dependency>
26 |         <dependency>
27 |             <groupId>org.slf4j</groupId>
28 |             <artifactId>slf4j-api</artifactId>
29 |             <version>1.7.6</version>
30 |         </dependency>
31 |         <dependency>
32 |             <groupId>org.slf4j</groupId>
33 |             <artifactId>slf4j-log4j12</artifactId>
34 |             <version>1.7.6</version>
35 |         </dependency>
36 |         <dependency>
37 |             <groupId>junit</groupId>
38 |             <artifactId>junit</artifactId>
39 |             <version>4.11</version>
40 |             <scope>test</scope>
41 |         </dependency>
42 |         <dependency>
43 |             <groupId>redis.clients</groupId>
44 |             <artifactId>jedis</artifactId>
45 |             <version>2.5.2</version>
46 |         </dependency>
47 |     </dependencies>
48 |
49 |     <build>
50 |         <resources>
51 |             <resource>
52 |                 <directory>src/main/resources</directory>
53 |                 <includes>
54 |                     <include>**/*</include>
55 |                 </includes>
56 |             </resource>
57 |             <resource>
58 |                 <directory>src/main/resources</directory>
59 |                 <includes>
60 |                     <include>log4j.properties</include>
61 |                 </includes>
62 |                 <filtering>true</filtering>
63 |             </resource>
64 |         </resources>
65 |
66 |         <plugins>
67 |             <plugin>
68 |                 <groupId>org.apache.maven.plugins</groupId>
69 |                 <artifactId>maven-assembly-plugin</artifactId>
70 |                 <version>2.5.5</version>
71 |                 <configuration>
72 |                     <archive>
73 |                         <manifest>
74 |                             <mainClass>com.github.wycm.Music163</mainClass>
75 |                         </manifest>
76 |                     </archive>
77 |                     <descriptorRefs>
78 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
79 |                     </descriptorRefs>
80 |                 </configuration>
81 |             </plugin>
82 |         </plugins>
83 |     </build>
84 |
85 |     <profiles>
86 |         <profile>
87 |             <id>dev</id>
88 |             <build>
89 |                 <filters>
90 |                     <filter>src/main/resources/log4j-dev.properties</filter>
91 |                 </filters>
92 |             </build>
93 |             <activation>
94 |                 <activeByDefault>true</activeByDefault>
95 |             </activation>
96 |         </profile>
97 |         <profile>
98 |             <id>prod</id>
99 |             <build>
100 |                 <filters>
101 |                     <filter>src/main/resources/log4j-prod.properties</filter>
102 |                 </filters>
103 |             </build>
104 |         </profile>
105 |     </profiles>
106 | </project>
--------------------------------------------------------------------------------
/music163/src/main/java/com/github/wycm/Music163.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.openqa.selenium.*;
4 | import org.openqa.selenium.chrome.ChromeDriver;
5 | import org.openqa.selenium.chrome.ChromeOptions;
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import java.util.*;
10 | import java.util.concurrent.TimeUnit;
11 | import java.util.regex.Matcher;
12 | import java.util.regex.Pattern;
13 |
14 | /**
15 | * Created by wycm
16 | */
17 | public class Music163 {
18 | private static Logger logger = LoggerFactory.getLogger(Music163.class);
19 |
20 | //paste the raw cookies from a logged-in browser session
21 | private final static String RAW_COOKIES = "cookie1=value1; cookie2=value2";
22 | private final static String CHROME_DRIVER_PATH = "/Users/wangyang/Downloads/chromedriver";
23 | //playlist id
24 | private static String startId = "22336453";
25 |
26 |
27 | private static String userId = null;
28 | private static Set<String> playListSet = new HashSet<>();
29 | private static Pattern pattern = Pattern.compile("(.*?)(.*?)");
30 | private static Pattern songName = Pattern.compile("class=\"f-thide name fc1 f-fl\" title=\"(.*?)\"");
31 | private static ChromeOptions chromeOptions = new ChromeOptions();
32 | private static WebDriver driver = null;
33 | static {
34 | System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
35 | chromeOptions.addArguments("--no-sandbox");
36 | }
37 | public static void main(String[] args) throws InterruptedException {
38 | while (true){
39 | try {
40 | driver = new ChromeDriver(chromeOptions);
41 | playListSet.add(startId);
42 | invoke();
43 | } catch (Exception e){
44 | logger.error(e.getMessage(), e);
45 | } finally {
46 | driver.quit();
47 | }
48 | Thread.sleep(1000 * 10);
49 | }
50 | }
51 |
52 | /**
53 | * Initialize cookies
54 | */
55 | private static void initCookies(){
56 | Arrays.stream(RAW_COOKIES.split("; ")).forEach(rawCookie -> {
57 | String[] ss = rawCookie.split("=");
58 | Cookie cookie = new Cookie.Builder(ss[0], ss[1]).domain(".163.com").build();
59 | driver.manage().addCookie(cookie);
60 | });
61 | }
62 | private static void invoke() throws InterruptedException {
63 | driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
64 | driver.manage().timeouts().pageLoadTimeout(15, TimeUnit.SECONDS);
65 | String s = null;
66 | driver.get("http://music.163.com/");
67 | initCookies();
68 | driver.get("http://music.163.com/");
69 | s = driver.getPageSource();
70 | userId = group(s, "userId:(\\d+)", 1);
71 | driver.get("https://music.163.com/#/playlist?id=" + startId);
72 | driver.switchTo().frame("contentFrame");
73 | WebElement element = driver.findElement(By.cssSelector("[id=content-operation]>a:first-child"));
74 | element.click();
75 | ((JavascriptExecutor) driver).executeScript("window.open('about:blank')");
76 | ArrayList<String> tabs = new ArrayList<>(driver.getWindowHandles());
77 | driver.switchTo().window(tabs.get(0));
78 | driver.switchTo().defaultContent();
79 | int i = 0;
80 | String lastSongName = "";
81 | int count = 0;
82 | while (true){
83 | if(i > Integer.MAX_VALUE - 2){
84 | break;
85 | }
86 | i++;
87 | s = driver.getPageSource();
88 | driver.switchTo().window(tabs.get(1)); //switches to new tab
89 | String songs = null;
90 | try{
91 | driver.get("https://music.163.com/user/home?id=" + userId);
92 | driver.switchTo().frame("contentFrame");
93 | songs = group(driver.getPageSource(), "累积听歌(\\d+)首", 1);
94 | } catch (TimeoutException e){
95 | logger.error(e.getMessage(), e);
96 | }
97 | driver.switchTo().window(tabs.get(0));
98 | Matcher matcher = pattern.matcher(s);
99 | Matcher songNameMatcher = songName.matcher(s);
100 | if (matcher.find() && songNameMatcher.find()){
101 | String songNameStr = songNameMatcher.group(1);
102 | if (!songNameStr.equals(lastSongName)){
103 | count++;
104 | lastSongName = songNameStr;
105 | }
106 | logger.info(songNameStr + "-" + matcher.group(1) + matcher.group(2) + "---now playing song #" + count + ", total songs played: " + songs);
107 | } else {
108 | logger.info("failed to parse the play record or song name");
109 | }
110 | Thread.sleep(1000 * 30);
111 | }
112 | }
113 | public static String group(String str, String regex, int index) {
114 | Pattern pattern = Pattern.compile(regex);
115 | Matcher matcher = pattern.matcher(str);
116 | return matcher.find() ? matcher.group(index) : "";
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/www.goubanjia.com/README.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | * The previous post used zhihu.com as an example of basic network-request analysis. This one covers a way to handle an anti-crawling technique: front-end data obfuscation.
3 | ## Goal
4 | * While writing https://github.com/wycm/zhihu-crawler I needed free http proxies and found this site: http://www.goubanjia.com/ . The goal is to crawl the ip and port values from it; interested readers can try it themselves.
5 | ## Getting started
6 | * Open the site's front page and inspect the tags holding the ip and port in the DevTools console.
7 | (screenshot, figure 1: the ip tag and its hidden child tags in the console)
8 | * As figure 1 shows, the ip is padded with irrelevant, non-displayed tags to obfuscate the data. The principle is simple: a tag carrying the style="display:none" attribute is never rendered on the page. Knowing that, the fix is easy: while parsing, drop every tag with style="display:none", and the real ip and port come out cleanly.
9 | * The code:
10 | ```
11 | package com.cnblogs.wycm;
12 |
13 | import org.jsoup.Jsoup;
14 | import org.jsoup.nodes.Document;
15 | import org.jsoup.nodes.Element;
16 | import org.jsoup.select.Elements;
17 | import java.io.IOException;
18 | import java.net.URL;
19 |
20 | /**
21 | *
22 | * Parsing uses the Jsoup framework, a Java library for working with HTML; it provides a very convenient API for extracting and manipulating data, with jquery-like selectors for finding tags.
23 | * Since the request here is simple, the http request is issued directly through Jsoup rather than via the HttpClient framework used in the previous post.
24 | * For Jsoup usage, see http://www.open-open.com/jsoup/
25 | *
26 | */
27 | public class Chapter1 {
28 | public static void main(String[] args) throws IOException {
29 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
30 | //select all tr nodes under the table with class='table'
31 | Elements elements = document.select("table[class=table] tr");
32 | for (int i = 1; i < elements.size(); i++){
33 | //get the first td node
34 | Element td = elements.get(i).select("td").first();
35 | /**
36 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
37 | * This covers both variants:
38 | * style=display: none;
39 | * style=display:none;
40 | */
41 | for(Element none : td.select("[style*=none;]")){
42 | none.remove();
43 | }
44 | //strip spaces
45 | String ipPort = td.text().replaceAll(" ", "");
46 | //print
47 | System.out.println(ipPort);
48 | }
49 | }
50 | }
51 | /*
52 | Output of the first run:
53 | 183.129.246.228:8132
54 | 222.92.136.206:8987
55 | 54.238.186.100:8988
56 | ...
57 | Output of the second run:
58 | 183.129.246.228:8377
59 | 222.92.136.206:9059
60 | 54.238.186.100:8622
61 | ...
62 | */
63 | ```
64 | * The ip now comes out correctly, but the port turns out to be obfuscated as well, and it changes on every request. Disable JavaScript in your browser and refresh the page: the port is different each time. The correct port we normally see is produced by a JavaScript routine, so an ordinary crawler only ever fetches a wrong value. To get the real port, we need to analyze the JavaScript logic that restores it.
65 | * Open the console again -> Sources -> set a breakpoint on a line of js code (click the line number), as below
66 | (screenshot: setting a js breakpoint in the Sources panel)
67 | * Refresh the page -> the page shows "Paused in debugger" -> switch to Elements -> right-click the td node -> Break on... -> subtree modifications. These two steps set up breakpoint debugging, pausing whenever the td node changes.
68 | (screenshot: adding a subtree-modifications breakpoint on the td node)
69 | * Go to Sources -> F8 (resume); execution pauses once more, exactly when the js script restores the correct port (as below)
70 | (screenshot: paused inside the script that restores the port)
71 | * The call stack is many levels deep; a quick way to pinpoint the right function is to watch how the local variables change. Since it is the port being modified here, find the matching variable and its logic. A little analysis shows the port is rewritten by an anonymous function, as below
72 | (screenshot: the anonymous function that rewrites the port)
73 | * Formatted, the code looks like this:
74 | ```
75 | var _$ = ['\x2e\x70\x6f\x72\x74', "\x65\x61\x63\x68", "\x68\x74\x6d\x6c", "\x69\x6e\x64\x65\x78\x4f\x66", '\x2a', "\x61\x74\x74\x72", '\x63\x6c\x61\x73\x73', "\x73\x70\x6c\x69\x74", "\x20", "", "\x6c\x65\x6e\x67\x74\x68", "\x70\x75\x73\x68", '\x41\x42\x43\x44\x45\x46\x47\x48\x49\x5a', "\x70\x61\x72\x73\x65\x49\x6e\x74", "\x6a\x6f\x69\x6e", ''];
76 | $(function() {
77 | $(_$[0])[_$[1]](function() {
78 | var a = $(this)[_$[2]]();
79 | if (a[_$[3]](_$[4]) != -0x1) {
80 | return
81 | }
82 | ;var b = $(this)[_$[5]](_$[6]);
83 | try {
84 | b = (b[_$[7]](_$[8]))[0x1];
85 | var c = b[_$[7]](_$[9]);
86 | var d = c[_$[10]];
87 | var f = [];
88 | for (var g = 0x0; g < d; g++) {
89 | f[_$[11]](_$[12][_$[3]](c[g]))
90 | }
91 | ;$(this)[_$[2]](window[_$[13]](f[_$[14]](_$[15])) >> 0x3)
92 | } catch (e) {}
93 | })
94 | })
95 | ```
96 | * De-obfuscated, it reads:
97 | ```
98 | var _$ = ['.port', "each", "html", "indexOf", '*', "attr", 'class', "split", " ", "", "length", "push", 'ABCDEFGHIZ', "parseInt", "join", ''];
99 | $(function() {
100 | $('.port').each(function() {
101 | var a = $(this).html();
102 | if (a.indexOf('*') != -0x1) {
103 | return
104 | }
105 | ;var b = $(this).attr('class');
106 | try {
107 | b = (b.split(" "))[0x1];
108 | var c = b.split("");
109 | var d = c.length;
110 | var f = [];
111 | for (var g = 0x0; g < d; g++) {
112 | f.push('ABCDEFGHIZ'.indexOf(c[g]))
113 | }
114 | ;$(this).html(window.parseInt(f.join('')) >> 0x3)
115 | } catch (e) {}
116 | })
117 | })
118 | ```
119 | * The logic of this code: read the port tag's class attribute, take the trailing group of capital letters, and for each character look up its index in 'ABCDEFGHIZ'; parseInt the concatenated digits, then shift the result right by 3 bits. For example, a hypothetical class of "port GEA" maps to indices 6, 4, 0, giving 640, and 640 >> 3 = 80.
120 | * Full implementation
121 | ```
122 | package com.cnblogs.wycm;
123 |
124 | import org.jsoup.Jsoup;
125 | import org.jsoup.nodes.Document;
126 | import org.jsoup.nodes.Element;
127 | import org.jsoup.select.Elements;
128 |
129 | import java.io.IOException;
130 | import java.net.URL;
131 |
132 | public class Chapter2 {
133 | public static void main(String[] args) throws IOException {
134 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
135 | setPort(document);
136 | //select all tr nodes under the table with class='table'
137 | Elements elements = document.select("table[class=table] tr");
138 | for (int i = 1; i < elements.size(); i++){
139 | //get the first td node
140 | Element td = elements.get(i).select("td").first();
141 | /**
142 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
143 | * This covers both variants:
144 | * style=display: none;
145 | * style=display:none;
146 | */
147 | for(Element none : td.select("[style*=none;]")){
148 | none.remove();
149 | }
150 | //strip spaces
151 | String ipPort = td.text().replaceAll(" ", "");
152 | //print
153 | System.out.println(ipPort);
154 | }
155 | }
156 |
157 | /**
158 | * Port restoration, ported from the site's js code
159 | * @param doc
160 | */
161 | private static void setPort(Document doc){
162 | for (Element e : doc.select(".port")){//$('.port').each(function() {
163 | String a = e.text();//var a = $(this).html();
164 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) {
165 | continue;//a plain return inside jQuery's each() skips the element, not the whole loop
166 | }
167 | String b = e.attr("class");//var b = $(this).attr('class');
168 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1];
169 | String[] c = b.split("");//var c = b.split("");
170 | int d = b.length();//var d = c.length;
171 | StringBuilder f = new StringBuilder();//var f = [];
172 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) {
173 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g]))
174 | }
175 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3)
176 | }
177 | }
178 | }
179 | ```
180 | * Maven dependency
181 | ```
182 | <dependency>
183 |     <groupId>org.jsoup</groupId>
184 |     <artifactId>jsoup</artifactId>
185 |     <version>1.10.2</version>
186 | </dependency>
187 | ```
188 | ## Summary
189 | * This post briefly showed one way to deal with front-end obfuscation as an anti-crawling measure. There are other ways to handle it, such as a headless browser, e.g. the phantomjs framework. Headless browsers were originally built for automated testing; being webkit-based, they render pages like a real browser, so they can crawl this kind of front-end-obfuscated site fairly easily. In general, whatever a normal browser can display, this approach can also crawl. Its biggest problem is efficiency: every page load also downloads the page's extra resources such as js scripts, and once downloaded the scripts still have to be executed.
190 | * The approach taken here is to read the js code, work out the obfuscation logic, and re-implement that logic in the target language. It works well against simple encryption or obfuscation. But for large, complex sites such as Baidu or Weibo, where the data sits behind a login, hand-rolled simulated logins get much harder: hunting down where every login parameter comes from eats a lot of effort, and the cost of analyzing the requests is high. The advantage of this approach is speed: it fetches only the target data, with no extra network-request overhead.
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/HttpClientUtil.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.*;
4 | import org.apache.http.client.CookieStore;
5 | import org.apache.http.client.HttpRequestRetryHandler;
6 | import org.apache.http.client.config.CookieSpecs;
7 | import org.apache.http.client.config.RequestConfig;
8 | import org.apache.http.client.entity.UrlEncodedFormEntity;
9 | import org.apache.http.client.methods.CloseableHttpResponse;
10 | import org.apache.http.client.methods.HttpGet;
11 | import org.apache.http.client.methods.HttpPost;
12 | import org.apache.http.client.methods.HttpRequestBase;
13 | import org.apache.http.client.protocol.HttpClientContext;
14 | import org.apache.http.config.ConnectionConfig;
15 | import org.apache.http.config.Registry;
16 | import org.apache.http.config.RegistryBuilder;
17 | import org.apache.http.config.SocketConfig;
18 | import org.apache.http.conn.ConnectTimeoutException;
19 | import org.apache.http.conn.socket.ConnectionSocketFactory;
20 | import org.apache.http.conn.socket.PlainConnectionSocketFactory;
21 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
22 | import org.apache.http.conn.ssl.TrustStrategy;
23 | import org.apache.http.cookie.Cookie;
24 | import org.apache.http.impl.client.*;
25 | import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
26 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
27 | import org.apache.http.message.BasicNameValuePair;
28 | import org.apache.http.protocol.HttpContext;
29 | import org.apache.http.ssl.SSLContexts;
30 | import org.apache.log4j.Logger;
31 |
32 | import javax.net.ssl.SSLContext;
33 | import javax.net.ssl.SSLException;
34 | import java.io.*;
35 | import java.net.UnknownHostException;
36 | import java.nio.charset.CodingErrorAction;
37 | import java.security.KeyStore;
38 | import java.security.cert.CertificateException;
39 | import java.security.cert.X509Certificate;
40 | import java.util.ArrayList;
41 | import java.util.List;
42 | import java.util.Map;
43 |
44 | /**
45 | * HttpClient utility class
46 | */
47 | public class HttpClientUtil {
48 | private static Logger logger = Logger.getLogger(HttpClientUtil.class);
49 | private static CloseableHttpClient httpClient;
50 | private final static HttpClientContext httpClientContext = HttpClientContext.create();
51 | private final static String userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727)";
52 | private static HttpHost proxy;
53 | private static RequestConfig requestConfig;
54 | static {
55 | init();
56 | }
57 | private static void init() {
58 | try {
59 | SSLContext sslContext =
60 | SSLContexts.custom()
61 | .loadTrustMaterial(KeyStore.getInstance(KeyStore.getDefaultType()), new TrustStrategy() {
62 | @Override
63 | public boolean isTrusted(X509Certificate[] chain, String authType)
64 | throws CertificateException {
65 | return true;
66 | }
67 | }).build();
68 | SSLConnectionSocketFactory sslSFactory =
69 | new SSLConnectionSocketFactory(sslContext);
70 | Registry<ConnectionSocketFactory> socketFactoryRegistry =
71 | RegistryBuilder.<ConnectionSocketFactory>create()
72 | .register("http", PlainConnectionSocketFactory.INSTANCE).register("https", sslSFactory)
73 | .build();
74 |
75 | PoolingHttpClientConnectionManager connManager =
76 | new PoolingHttpClientConnectionManager(socketFactoryRegistry);
77 |
78 | SocketConfig socketConfig = SocketConfig.custom().setTcpNoDelay(true).build();
79 | connManager.setDefaultSocketConfig(socketConfig);
80 |
81 | ConnectionConfig connectionConfig =
82 | ConnectionConfig.custom().setMalformedInputAction(CodingErrorAction.IGNORE)
83 | .setUnmappableInputAction(CodingErrorAction.IGNORE).setCharset(Consts.UTF_8).build();
84 | connManager.setDefaultConnectionConfig(connectionConfig);
85 | connManager.setMaxTotal(300);
86 | connManager.setDefaultMaxPerRoute(100);
87 |
88 | HttpRequestRetryHandler retryHandler = new HttpRequestRetryHandler() {
89 | @Override
90 | public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
91 | if (executionCount > 0) {
92 | return false;
93 | }
94 | if (exception instanceof InterruptedIOException) {
95 | return true;
96 | }
97 | if (exception instanceof ConnectTimeoutException) {
98 | return true;
99 | }
100 | if (exception instanceof UnknownHostException) {
101 | return true;
102 | }
103 | if (exception instanceof SSLException) {
104 | return true;
105 | }
106 | HttpRequest request = HttpClientContext.adapt(context).getRequest();
107 | if (!(request instanceof HttpEntityEnclosingRequest)) {
108 | return true;
109 | }
110 | return false;
111 | }
112 | };
113 |
114 | HttpClientBuilder httpClientBuilder =
115 | HttpClients.custom().setConnectionManager(connManager).setRetryHandler(retryHandler)
116 | //make POST requests follow redirects by default
117 | .setRedirectStrategy(new LaxRedirectStrategy())
118 | .setDefaultCookieStore(new BasicCookieStore()).setUserAgent(userAgent);
119 | if (proxy != null) {
120 | httpClientBuilder.setRoutePlanner(new DefaultProxyRoutePlanner(proxy)).build();
121 | }
122 | httpClient = httpClientBuilder.build();
123 |
124 | requestConfig = RequestConfig.custom().setSocketTimeout(10000).
125 | setConnectTimeout(10000).
126 | setConnectionRequestTimeout(10000).
127 | setCookieSpec(CookieSpecs.STANDARD).
128 | build();
129 | } catch (Exception e) {
130 | logger.error(e.getMessage());
131 | }
132 | }
133 | public static String get(String url) throws IOException {
134 | HttpGet request = new HttpGet(url);
135 | return getWebPage(request, null, "UTF-8", false);
136 | }
137 | public static String get(HttpRequestBase request, RequestConfig config) throws IOException {
138 | return getWebPage(request, config, "UTF-8", false);
139 | }
140 | public static String getWebPage(HttpRequestBase request) throws IOException {
141 | return getWebPage(request, null, "UTF-8", false);
142 | }
143 | public static String getWebPage(HttpRequestBase request, RequestConfig config) throws IOException {
144 | return getWebPage(request, config, "UTF-8", false);
145 | }
146 | /**
147 | * @param request the request
148 | * @param config the request config; the default is used when null
149 | * @param encoding character encoding
150 | * @param isPrintConsole whether to also print each line to the console
151 | * @return the page content
152 | */
153 | public static String getWebPage(HttpRequestBase request,
154 | RequestConfig config,
155 | String encoding,
156 | boolean isPrintConsole) throws IOException {
157 | CloseableHttpResponse response = null;
158 | if (config != null){
159 | request.setConfig(config);
160 | }
161 | else {
162 | request.setConfig(requestConfig);
163 | }
164 | response = httpClient.execute(request, httpClientContext);
165 | logger.info("status---" + response.getStatusLine().getStatusCode());
166 | BufferedReader rd = null;
167 | StringBuilder webPage = null;
168 | try {
169 | rd = new BufferedReader(
170 | new InputStreamReader(response.getEntity().getContent(),encoding));
171 | String line = "";
172 | webPage = new StringBuilder();
173 | while((line = rd.readLine()) != null) {
174 | webPage.append(line);
175 | if(isPrintConsole){
176 | System.out.println(line);
177 | }
178 | }
179 | } catch (IOException e) {
180 | e.printStackTrace();
181 | }
182 | request.releaseConnection();
183 | response.close();
184 | return webPage.toString();
185 | }
186 | /**
187 | * Set url-encoded form parameters on the request
188 | * @param request
189 | * @param params
190 | */
191 | public static void setHttpPostParams(HttpPost request, Map<String, String> params) throws UnsupportedEncodingException {
192 | List<NameValuePair> formParams = new ArrayList<>();
193 | for (String key : params.keySet()) {
194 | formParams.add(new BasicNameValuePair(key,params.get(key)));
195 | }
196 | UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, "utf-8");
197 | request.setEntity(entity);
198 | }
199 | }
200 |
--------------------------------------------------------------------------------
/www.dianping.com/src/test/resources/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/www.dianping.com/src/main/java/com/github/wycm/DianpingCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.methods.HttpGet;
4 | import org.apache.http.impl.client.CloseableHttpClient;
5 | import org.apache.http.impl.client.HttpClients;
6 | import org.apache.http.util.EntityUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.jsoup.nodes.Element;
10 |
11 | import java.io.IOException;
12 | import java.util.*;
13 | import java.util.regex.Matcher;
14 | import java.util.regex.Pattern;
15 | import java.util.stream.Collectors;
16 |
17 | /**
18 | * Meituan-Dianping font-based anti-crawling
19 | */
20 | public class DianpingCrawler {
21 | public static void main(String[] args) throws IOException {
22 | getContent("http://www.dianping.com/shop/96231053");
23 | }
24 | private static void getContent(String detailUrl) throws IOException {
25 | CloseableHttpClient httpClient = HttpClients
26 | .custom()
27 | .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
28 | .build();
29 | String originalContent = EntityUtils.toString(httpClient.execute(new HttpGet(detailUrl)).getEntity());
30 | Document document= Jsoup.parse(originalContent);
31 | String cssUrl = "http:" + document.select("link[href*=svgtextcss]").first().attr("href");
32 | String cssResponse = Jsoup.connect(cssUrl).execute().body();
33 | // System.out.println(cssResponse);
34 | Pattern pattern = Pattern.compile("class\\^=\"(.*?)\".*?url\\((.*?)\\)");
35 | Matcher matcher = pattern.matcher(cssResponse);
36 | Map<String, String> urlMap = new HashMap<>();
37 | Map<String, String> svgMap = new HashMap<>();
38 | while (matcher.find()){
39 | String prefix = matcher.group(1);
40 | String url = "http:" + matcher.group(2);
41 | urlMap.put(prefix, url);
42 | svgMap.put(prefix, EntityUtils.toString(httpClient.execute(new HttpGet(url)).getEntity(), "utf-8"));
43 | System.out.println(prefix);
44 | System.out.println(url);
45 | }
46 | pattern = Pattern.compile("\\.[a-z]{2}-.*?\\{.*?\\}");
47 | matcher = pattern.matcher(cssResponse);
48 | List<CssBackground> cssList = new ArrayList<>();
49 | Pattern cssBackgroundPattern = Pattern.compile("(\\.([a-z]{2})-.*?)\\{background:(.*?)\\.0px (.*?)\\.0px");
50 | Matcher cssBackgroundMatch;
51 | while (matcher.find()){
52 | cssBackgroundMatch = cssBackgroundPattern.matcher(matcher.group(0));
53 | if (cssBackgroundMatch.find()){
54 | cssList.add(new CssBackground(cssBackgroundMatch.group(1), Integer.valueOf(cssBackgroundMatch.group(3)), Integer.valueOf(cssBackgroundMatch.group(4))));
55 | }
56 | }
57 | //sort the css rules, grouping by class prefix
58 | cssList.sort((c1, c2) ->{
59 | int i = c1.getClassName().substring(0, 3).compareTo(c2.getClassName().substring(0, 3));
60 | if (i != 0){
61 | return i;
62 | } else {
63 | i = c2.getY().compareTo(c1.getY());
64 | if (i != 0){
65 | return i;
66 | } else {
67 | return c2.getX().compareTo(c1.getX());
68 | }
69 | }
70 | });
71 | cssList.forEach(System.out::println);
72 | int xIndex = 0;
73 | int yIndex = 0;
74 | CssBackground lastCssBackground = null;
75 | //work out each glyph's index coordinates
76 | for(CssBackground c : cssList){
77 | if (lastCssBackground == null){
78 | lastCssBackground = c;
79 | continue;
80 | } else {
81 | if (!c.getClassName().substring(0, 3).equals(lastCssBackground.getClassName().substring(0, 3))){
82 | xIndex = 0;
83 | yIndex = 0;
84 | } else if (!c.getX().equals(lastCssBackground.getX()) && c.getY().equals(lastCssBackground.getY())){
85 | c.setxIndex(++xIndex);
86 | c.setyIndex(yIndex);
87 | } else if (c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){
88 | c.setxIndex(xIndex);
89 | c.setyIndex(++yIndex);
90 | } else if (!c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){
91 | xIndex = 0;
92 | c.setxIndex(xIndex);
93 | c.setyIndex(++yIndex);
94 | }
95 | lastCssBackground = c;
96 | }
97 | }
98 | Map<String, Document> cacheDocumentMap = new HashMap<>();
99 | Map<String, CssBackground> cssBackgroundMap = new HashMap<>();
100 | String lastPrefix = "";
101 | cssList.stream().map(c -> {
102 | c.setSvgResponse(svgMap.get(c.getClassName().substring(1, 4)));
103 | if (!cacheDocumentMap.containsKey(c.getClassName().substring(0, 3))){
104 | cacheDocumentMap.put(c.getClassName().substring(0, 3), Jsoup.parse(c.getSvgResponse()));
105 | }
106 | c.setDocument(cacheDocumentMap.get(c.getClassName().substring(0, 3)));
107 | Document doc = c.getDocument();
108 | Element e = null;
109 | if ((c.getX() == -6 && c.getY() == -6) || (c.getX() % -12 == -7 && c.getY() == -6)){
110 | e = doc.select("text").first();
111 | } else if ((c.getX() == -7 && c.getY() == -7) || (c.getX() % 14 == -8 && c.getY() == -7)){
112 | e = doc.select("text").first();
113 | } else if (c.getX() % 6 == -1 && c.getY() == -6){
114 | e = doc.select("text").first();
115 | } else if (c.getX() % -12 == 0 && c.getY() % -30 == -6){
116 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first();
117 | } else if (c.getX() % -14 == 0 && c.getY() % -30 == -7){
118 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first();
119 | }
120 | if (e == null){
121 | //belongs with the previous one
122 | //CssBackground{className='.hy-GijB', x=-7, y=-6, xIndex=0, yIndex=0, actualFont='null'}
123 | //CssBackground{className='.hy-o8Bu', x=-19, y=-6, xIndex=0, yIndex=0, actualFont='null'}
124 | //CssBackground{className='.hy-7IxC', x=-31, y=-6, xIndex=0, yIndex=0, actualFont='null'}
125 | //CssBackground{className='.hy-8zQE', x=-43, y=-6, xIndex=0, yIndex=0, actualFont='null'}
126 | //CssBackground{className='.hy-PrgG', x=-55, y=-6, xIndex=0, yIndex=0, actualFont='null'}
127 | //CssBackground{className='.hy-Qbc8', x=-67, y=-6, xIndex=0, yIndex=0, actualFont='null'}
128 | //CssBackground{className='.hy-TnVD', x=-79, y=-6, xIndex=0, yIndex=0, actualFont='null'}
129 | //CssBackground{className='.hy-TqUO', x=-91, y=-6, xIndex=0, yIndex=0, actualFont='null'}
130 | //CssBackground{className='.hy-UkCG', x=-103, y=-6, xIndex=0, yIndex=0, actualFont='null'}
131 | //CssBackground{className='.hy-yOPP', x=-114, y=-6, xIndex=0, yIndex=0, actualFont='null'}
132 | //TODO: the last one does not satisfy the rule
133 | }
134 | String text = e.text();
135 | c.setActualFont(text.substring(c.getxIndex(), c.getxIndex() + 1));
136 | cssBackgroundMap.put(c.getClassName().substring(1, c.getClassName().length()), c);
137 | return c;
138 | }).collect(Collectors.toList());
139 | //rebuild the page: replace each obfuscated tag with its actual glyph
140 | Pattern spanPattern = Pattern.compile("<span class=\"(.*?)\"></span>"); //the original tag regex was lost; an empty span carrying the obfuscation class is assumed here
141 | Matcher contentMatcher = spanPattern.matcher(originalContent);
142 | while (contentMatcher.find()){
143 | String s1 = contentMatcher.group(0);
144 | String s2 = cssBackgroundMap.get(contentMatcher.group(1)).getActualFont();
145 | originalContent = originalContent.replace(s1, s2);
146 | }
147 | System.out.println(originalContent);
148 | }
149 | static class CssBackground{
150 | private String className;
151 | private Integer x;
152 | private Integer y;
153 | private int xIndex;
154 | private int yIndex;
155 | private String svgResponse;
156 | private String actualFont;
157 | private Document document;
158 |
159 | public CssBackground(String className, int x, int y) {
160 | this.className = className;
161 | this.x = x;
162 | this.y = y;
163 | }
164 |
165 | public String getClassName() {
166 | return className;
167 | }
168 |
169 | public void setClassName(String className) {
170 | this.className = className;
171 | }
172 |
173 | public Integer getX() {
174 | return x;
175 | }
176 |
177 | public void setX(Integer x) {
178 | this.x = x;
179 | }
180 |
181 | public Integer getY() {
182 | return y;
183 | }
184 |
185 | public void setY(Integer y) {
186 | this.y = y;
187 | }
188 |
189 | public int getxIndex() {
190 | return xIndex;
191 | }
192 |
193 | public void setxIndex(int xIndex) {
194 | this.xIndex = xIndex;
195 | }
196 |
197 | public int getyIndex() {
198 | return yIndex;
199 | }
200 |
201 | public void setyIndex(int yIndex) {
202 | this.yIndex = yIndex;
203 | }
204 |
205 | public String getSvgResponse() {
206 | return svgResponse;
207 | }
208 |
209 | public void setSvgResponse(String svgResponse) {
210 | this.svgResponse = svgResponse;
211 | }
212 |
213 | public String getActualFont() {
214 | return actualFont;
215 | }
216 |
217 | public void setActualFont(String actualFont) {
218 | this.actualFont = actualFont;
219 | }
220 |
221 | public Document getDocument() {
222 | return document;
223 | }
224 |
225 | public void setDocument(Document document) {
226 | this.document = document;
227 | }
228 |
229 | @Override
230 | public String toString() {
231 | return "CssBackground{" +
232 | "className='" + className + '\'' +
233 | ", x=" + x +
234 | ", y=" + y +
235 | ", xIndex=" + xIndex +
236 | ", yIndex=" + yIndex +
237 | ", actualFont='" + actualFont + '\'' +
238 | '}';
239 | }
240 | }
241 | }
--------------------------------------------------------------------------------