├── README.md
├── weibo
│   ├── README.md
│   ├── src
│   │   └── main
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── wycm
│   │                       ├── WeiboReplyCrawler.java
│   │                       ├── WeiboCrawler.java
│   │                       └── HttpClientUtil.java
│   └── pom.xml
├── www.zhihu.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── ZhihuCrawler.java
├── music163
│   ├── .DS_Store
│   ├── .gitignore
│   ├── src
│   │   ├── test
│   │   │   └── java
│   │   │       └── com
│   │   │           └── github
│   │   │               └── wycm
│   │   │                   └── AppTest.java
│   │   └── main
│   │       ├── resources
│   │       │   ├── log4j-dev.properties
│   │       │   ├── log4j-prod.properties
│   │       │   └── log4j.properties
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── wycm
│   │                       └── Music163.java
│   ├── server-auto-deploy.sh
│   └── pom.xml
├── www.douban.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── DoubanCrawler.java
├── .gitignore
├── www.dianping.com
│   ├── README.md
│   ├── pom.xml
│   └── src
│       ├── test
│       │   ├── java
│       │   │   └── com
│       │   │       └── github
│       │   │           └── wycm
│       │   │               └── DianpingCrawlerTest.java
│       │   └── resources
│       │       └── test.html
│       └── main
│           └── java
│               └── com
│                   └── github
│                       └── wycm
│                           └── DianpingCrawler.java
└── www.goubanjia.com
    ├── pom.xml
    ├── src
    │   └── main
    │       └── java
    │           └── com
    │               └── github
    │                   └── wycm
    │                       └── GoubanjiaCrawler.java
    └── README.md
/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 | * A collection of crawlers for assorted websites, still being updated....
--------------------------------------------------------------------------------
/weibo/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 |
4 | ## An HttpClient demo that logs in to Weibo's mobile site and runs a keyword search
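5 |
6 | The crux of the flow, sketched from WeiboCrawler.java in this module (the credentials and the class name are placeholders):
7 | ```
8 | package com.github.wycm;
9 |
10 | import org.apache.http.client.methods.HttpPost;
11 |
12 | /**
13 |  * Minimal login-then-fetch sketch; see WeiboCrawler for the full flow.
14 |  */
15 | public class WeiboLoginSketch {
16 |     public static void main(String[] args) throws Exception {
17 |         HttpPost post = new HttpPost("https://passport.weibo.cn/sso/login");
18 |         // the login endpoint rejects requests that lack this Referer header
19 |         post.addHeader("Referer", "https://passport.weibo.cn/signin/login?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F");
20 |         HttpClientUtil.setHttpPostParams(post, WeiboCrawler.queryToMap("username=YOUR_PHONE&password=YOUR_PASSWORD&savestate=1&entry=mweibo"));
21 |         System.out.println(HttpClientUtil.getWebPage(post));
22 |     }
23 | }
24 | ```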
--------------------------------------------------------------------------------
/www.zhihu.com/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 | ## A simple demo that fetches the Zhihu front page using browser cookies
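4 |
5 | A minimal sketch of the idea (ZhihuCrawler.java in this module is the full demo and rebuilds a proper CookieStore instead of sending the raw header; the class name and cookie string below are placeholders):
6 | ```
7 | import org.apache.http.client.methods.HttpGet;
8 | import org.apache.http.impl.client.CloseableHttpClient;
9 | import org.apache.http.impl.client.HttpClients;
10 | import org.apache.http.util.EntityUtils;
11 |
12 | public class ZhihuCookieSketch {
13 |     public static void main(String[] args) throws Exception {
14 |         CloseableHttpClient client = HttpClients.createDefault();
15 |         HttpGet get = new HttpGet("https://www.zhihu.com");
16 |         // paste the Cookie header copied from a logged-in browser session
17 |         get.addHeader("Cookie", "cookie1=value1; cookie2=value2");
18 |         get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
19 |         System.out.println(EntityUtils.toString(client.execute(get).getEntity()));
20 |     }
21 | }
22 | ```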
--------------------------------------------------------------------------------
/music163/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wycm/crawler-set/HEAD/music163/.DS_Store
--------------------------------------------------------------------------------
/www.douban.com/README.md:
--------------------------------------------------------------------------------
1 | crawler-set
2 | ===========
3 |
4 | ## Crawling Douban's dynamic book-search page with Selenium
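5 |
6 | The core wait-then-parse pattern, sketched from DoubanCrawler.java (the chromedriver path and class name are placeholders):
7 | ```
8 | import org.jsoup.Jsoup;
9 | import org.openqa.selenium.By;
10 | import org.openqa.selenium.WebDriver;
11 | import org.openqa.selenium.chrome.ChromeDriver;
12 | import org.openqa.selenium.support.ui.WebDriverWait;
13 |
14 | public class DoubanSketch {
15 |     public static void main(String[] args) {
16 |         System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
17 |         WebDriver driver = new ChromeDriver();
18 |         driver.get("https://book.douban.com/subject_search?search_text=%E4%BA%92%E8%81%94%E7%BD%91&cat=1001");
19 |         // block until the dynamically rendered root node exists, then hand the DOM to Jsoup
20 |         new WebDriverWait(driver, 10).until(d -> d.findElement(By.cssSelector("div[id='root']")) != null);
21 |         Jsoup.parse(driver.getPageSource()).select("a[class=title-text]")
22 |                 .forEach(e -> System.out.println(e.text()));
23 |         driver.quit();
24 |     }
25 | }
26 | ```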
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # idea
2 | .idea/
3 | *.iml
4 |
5 | target/
6 | .classpath
7 | .project
8 | .settings/
9 |
--------------------------------------------------------------------------------
/music163/.gitignore:
--------------------------------------------------------------------------------
1 | # idea
2 | .idea/
3 | *.iml
4 |
5 | target/
6 | .classpath
7 | .project
8 | .settings/
9 |
10 | zhihucookies
11 | proxies
12 | zhiHuYZM.gif
--------------------------------------------------------------------------------
/www.dianping.com/README.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | * Analysis of, and a workaround for, Dianping's font-based anti-crawling
3 | * To be updated
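4 |
5 | A condensed sketch of the approach implemented in DianpingCrawler.java: the obfuscated glyphs on the page are empty tags whose css class maps, through background offsets, into an svg file that holds the real characters. The sketch below only extracts that class-to-offset mapping (the regex details and class name are illustrative):
6 | ```
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 |
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 |
13 | public class DianpingSketch {
14 |     public static void main(String[] args) throws Exception {
15 |         Document page = Jsoup.connect("http://www.dianping.com/shop/96231053")
16 |                 .userAgent("Mozilla/5.0").get();
17 |         // 1. the page links a css file whose rules tie each obfuscation class to an svg and a background offset
18 |         String cssUrl = "http:" + page.select("link[href*=svgtextcss]").first().attr("href");
19 |         String css = Jsoup.connect(cssUrl).ignoreContentType(true).execute().body();
20 |         // 2. rules look like .hy-xxxx{background:-84.0px -6.0px;}; the offsets index into the svg text
21 |         Matcher m = Pattern.compile("(\\.[a-z]{2}-\\w+)\\{background:(-?\\d+)\\.0px (-?\\d+)\\.0px").matcher(css);
22 |         while (m.find()) {
23 |             System.out.println(m.group(1) + " -> x=" + m.group(2) + " y=" + m.group(3));
24 |         }
25 |     }
26 | }
27 | ```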
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/WeiboReplyCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | /**
4 |  * Placeholder for a Weibo reply crawler; not implemented yet.
5 |  */
6 | public class WeiboReplyCrawler {
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/music163/src/test/java/com/github/wycm/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import static org.junit.Assert.assertTrue;
4 |
5 | import org.junit.Test;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | {
12 | /**
13 | * Rigorous Test :-)
14 | */
15 | @Test
16 | public void shouldAnswerWithTrue()
17 | {
18 | assertTrue( true );
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/music163/server-auto-deploy.sh:
--------------------------------------------------------------------------------
1 | # Kill any running crawler and Chrome processes
2 | sudo ps -ef |grep music163 | grep -v grep |awk '{print $2}'|xargs kill -9
3 | sudo ps -ef |grep chrome | grep -v grep |awk '{print $2}'|xargs kill -9
4 |
5 | # Pull the latest code and rebuild the fat jar with the prod profile
6 | cd /alidata/server/workspace/music163
7 | git pull origin master
8 | echo 'pull success'
9 | mvn clean
10 | mvn -Pprod package assembly:single
11 | echo 'mvn success'
12 |
13 | # Run Chrome against display :99 (e.g. Xvfb) and start the crawler in the background
14 | export DISPLAY=:99
15 | cd target
16 | java -jar music163-1.0-SNAPSHOT-jar-with-dependencies.jar &
17 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j-dev.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, stdout
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Threshold=INFO
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n
7 |
8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender
9 | log4j.appender.A1.Threshold=INFO
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n
12 | log4j.appender.A1.DatePattern='.'yyyyMMdd
13 | log4j.appender.A1.File=d:/log/all.log
14 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j-prod.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=INFO, A1
2 |
3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
4 | log4j.appender.stdout.Threshold=INFO
5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n
7 |
8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender
9 | log4j.appender.A1.Threshold=INFO
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n
12 | log4j.appender.A1.DatePattern='.'yyyyMMdd
13 | log4j.appender.A1.File=/alidata/log/music163.log
14 |
--------------------------------------------------------------------------------
/music163/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Every value below is a placeholder resolved at build time by Maven resource
2 | # filtering, using the filter file selected by the active profile
3 | # (log4j-dev.properties or log4j-prod.properties; see pom.xml).
4 | log4j.rootLogger=${log4j.rootLogger}
5 |
6 | log4j.appender.stdout=${log4j.appender.stdout}
7 | log4j.appender.stdout.Threshold=${log4j.appender.stdout.Threshold}
8 | log4j.appender.stdout.layout=${log4j.appender.stdout.layout}
9 | log4j.appender.stdout.layout.ConversionPattern=${log4j.appender.stdout.layout.ConversionPattern}
10 |
11 | log4j.appender.A1=${log4j.appender.A1}
12 | log4j.appender.A1.Threshold=${log4j.appender.A1.Threshold}
13 | log4j.appender.A1.layout=${log4j.appender.A1.layout}
14 | log4j.appender.A1.layout.ConversionPattern=${log4j.appender.A1.layout.ConversionPattern}
15 | log4j.appender.A1.DatePattern=${log4j.appender.A1.DatePattern}
16 | log4j.appender.A1.File=${log4j.appender.A1.File}
--------------------------------------------------------------------------------
/www.goubanjia.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>goubanjia</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>goubanjia-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |     </dependencies>
19 |
20 |     <build>
21 |         <plugins>
22 |             <plugin>
23 |                 <groupId>org.apache.maven.plugins</groupId>
24 |                 <artifactId>maven-compiler-plugin</artifactId>
25 |                 <version>2.0.2</version>
26 |                 <configuration>
27 |                     <source>1.7</source>
28 |                     <target>1.7</target>
29 |                 </configuration>
30 |             </plugin>
31 |         </plugins>
32 |     </build>
33 | </project>
--------------------------------------------------------------------------------
/www.zhihu.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>zhihu</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>zhihu-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |     </dependencies>
24 |
25 |     <build>
26 |         <plugins>
27 |             <plugin>
28 |                 <groupId>org.apache.maven.plugins</groupId>
29 |                 <artifactId>maven-compiler-plugin</artifactId>
30 |                 <version>2.0.2</version>
31 |                 <configuration>
32 |                     <source>1.7</source>
33 |                     <target>1.7</target>
34 |                 </configuration>
35 |             </plugin>
36 |         </plugins>
37 |     </build>
38 | </project>
--------------------------------------------------------------------------------
/www.douban.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>douban</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>selenium-geetest-crack</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.seleniumhq.selenium</groupId>
15 |             <artifactId>selenium-server</artifactId>
16 |             <version>3.0.1</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.jsoup</groupId>
20 |             <artifactId>jsoup</artifactId>
21 |             <version>1.7.2</version>
22 |         </dependency>
23 |     </dependencies>
24 |
25 |     <build>
26 |         <plugins>
27 |             <plugin>
28 |                 <groupId>org.apache.maven.plugins</groupId>
29 |                 <artifactId>maven-compiler-plugin</artifactId>
30 |                 <version>2.0.2</version>
31 |                 <configuration>
32 |                     <source>1.7</source>
33 |                     <target>1.7</target>
34 |                 </configuration>
35 |             </plugin>
36 |         </plugins>
37 |     </build>
38 | </project>
--------------------------------------------------------------------------------
/www.dianping.com/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>dianping</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>dianping-crawler</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.10.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>junit</groupId>
25 |             <artifactId>junit</artifactId>
26 |             <version>4.12</version>
27 |         </dependency>
28 |     </dependencies>
29 |
30 |     <build>
31 |         <plugins>
32 |             <plugin>
33 |                 <groupId>org.apache.maven.plugins</groupId>
34 |                 <artifactId>maven-compiler-plugin</artifactId>
35 |                 <version>2.0.2</version>
36 |                 <configuration>
37 |                     <source>1.8</source>
38 |                     <target>1.8</target>
39 |                 </configuration>
40 |             </plugin>
41 |         </plugins>
42 |     </build>
43 | </project>
--------------------------------------------------------------------------------
/www.dianping.com/src/test/java/com/github/wycm/DianpingCrawlerTest.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.methods.CloseableHttpResponse;
4 | import org.apache.http.client.methods.HttpGet;
5 | import org.apache.http.impl.client.HttpClients;
6 | import org.apache.http.util.EntityUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.select.Elements;
9 | import org.junit.Test;
10 |
11 | import java.io.BufferedReader;
12 | import java.io.IOException;
13 | import java.io.InputStreamReader;
14 | import java.util.stream.Collectors;
15 |
16 | import static org.junit.Assert.*;
17 |
18 | /**
19 | * Created by wycm on 2018/11/19.
20 | */
21 | public class DianpingCrawlerTest {
22 | @Test
23 | public void testJsoup(){
24 | String s = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/test.html"))).lines()
25 | .parallel().collect(Collectors.joining("\n"));
26 | Elements elements = Jsoup.parse(s).select("textPath[xlink:href='#1']");
27 | System.out.println(elements);
28 | }
29 | @Test
30 | public void testHttp() throws IOException {
31 | CloseableHttpResponse response = HttpClients.createDefault().execute(new HttpGet("http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/807789f715a7caed8e7c2475dcf94e20.svg"));
32 | System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
33 | }
34 | }
--------------------------------------------------------------------------------
/weibo/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |     <modelVersion>4.0.0</modelVersion>
5 |     <groupId>com.github.wycm</groupId>
6 |     <artifactId>weibo</artifactId>
7 |     <packaging>jar</packaging>
8 |     <version>1.0-SNAPSHOT</version>
9 |     <name>selenium-geetest-crack</name>
10 |     <url>http://maven.apache.org</url>
11 |
12 |     <dependencies>
13 |         <dependency>
14 |             <groupId>org.jsoup</groupId>
15 |             <artifactId>jsoup</artifactId>
16 |             <version>1.7.2</version>
17 |         </dependency>
18 |         <dependency>
19 |             <groupId>org.apache.httpcomponents</groupId>
20 |             <artifactId>httpclient</artifactId>
21 |             <version>4.5</version>
22 |         </dependency>
23 |         <dependency>
24 |             <groupId>log4j</groupId>
25 |             <artifactId>log4j</artifactId>
26 |             <version>1.2.15</version>
27 |         </dependency>
28 |         <dependency>
29 |             <groupId>com.jayway.jsonpath</groupId>
30 |             <artifactId>json-path</artifactId>
31 |             <version>2.2.0</version>
32 |         </dependency>
33 |     </dependencies>
34 |
35 |     <build>
36 |         <plugins>
37 |             <plugin>
38 |                 <groupId>org.apache.maven.plugins</groupId>
39 |                 <artifactId>maven-compiler-plugin</artifactId>
40 |                 <version>2.0.2</version>
41 |                 <configuration>
42 |                     <source>1.7</source>
43 |                     <target>1.7</target>
44 |                 </configuration>
45 |             </plugin>
46 |         </plugins>
47 |     </build>
48 | </project>
--------------------------------------------------------------------------------
/www.douban.com/src/main/java/com/github/wycm/DoubanCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.select.Elements;
7 | import org.openqa.selenium.By;
8 | import org.openqa.selenium.WebDriver;
9 | import org.openqa.selenium.WebElement;
10 | import org.openqa.selenium.chrome.ChromeDriver;
11 | import org.openqa.selenium.support.ui.ExpectedCondition;
12 | import org.openqa.selenium.support.ui.WebDriverWait;
13 |
14 | /**
15 | * Douban Selenium crawler
16 | * To run it, download chromedriver and update the chromedriver path in the code
17 | */
18 | public class DoubanCrawler {
19 | private static WebDriver driver;
20 | static {
21 | System.setProperty("webdriver.chrome.driver", "D:/dev/selenium/chromedriver_V2.30/chromedriver_win32/chromedriver.exe");
22 | driver = new ChromeDriver();
23 | }
24 | public static void main(String[] args) throws InterruptedException {
25 | douban();
26 | driver.quit();
27 | }
28 | private static void douban(){
29 | driver.get("https://book.douban.com/subject_search?search_text=%E4%BA%92%E8%81%94%E7%BD%91&cat=1001");
30 | By by = By.cssSelector("div[id='root']");
31 | waitForLoad(driver, by);
32 | String pageSource = driver.getPageSource();
33 | Document document = Jsoup.parse(pageSource);
34 | Elements elements = document.select("a[class=title-text]");
35 | for(Element element: elements){
36 | System.out.println(element.text());
37 | }
38 | }
39 | /**
40 | * Wait for the element to load; 10s timeout
41 | * @param driver
42 | * @param by
43 | */
44 | public static void waitForLoad(final WebDriver driver, final By by){
45 | new WebDriverWait(driver, 10).until(new ExpectedCondition<Boolean>() {
46 | public Boolean apply(WebDriver d) {
47 | WebElement element = driver.findElement(by);
48 | if (element != null){
49 | return true;
50 | }
51 | return false;
52 | }
53 | });
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/www.goubanjia.com/src/main/java/com/github/wycm/GoubanjiaCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.select.Elements;
7 |
8 | import java.io.IOException;
9 | import java.net.URL;
10 |
11 | public class GoubanjiaCrawler {
12 | public static void main(String[] args) throws IOException {
13 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
14 | setPort(document);
15 | //select all tr nodes under the table with class='table'
16 | Elements elements = document.select("table[class=table] tr");
17 | for (int i = 1; i < elements.size(); i++){
18 | //get the first td node
19 | Element td = elements.get(i).select("td").first();
20 | /**
21 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
22 | * This covers both variants:
23 | * style=display: none;
24 | * style=display:none;
25 | */
26 | for(Element none : td.select("[style*=none;]")){
27 | none.remove();
28 | }
29 | //strip spaces
30 | String ipPort = td.text().replaceAll(" ", "");
31 | //print
32 | System.out.println(ipPort);
33 | }
34 | }
35 |
36 | /**
37 | * Port restoration, ported from the site's js code
38 | * @param doc
39 | */
40 | private static void setPort(Document doc){
41 | for (Element e : doc.select(".port")){//$('.port').each(function() {
42 | String a = e.text();//var a = $(this).html();
43 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) {
44 | continue;//a plain return inside jQuery's each() skips the element, not the whole loop
45 | }
46 | String b = e.attr("class");//var b = $(this).attr('class');
47 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1];
48 | String[] c = b.split("");//var c = b.split("");
49 | int d = b.length();//var d = c.length;
50 | StringBuilder f = new StringBuilder();//var f = [];
51 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) {
52 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g]))
53 | }
54 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3)
55 | }
56 | }
57 | }
--------------------------------------------------------------------------------
/www.zhihu.com/src/main/java/com/github/wycm/ZhihuCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.CookieStore;
4 | import org.apache.http.client.methods.CloseableHttpResponse;
5 | import org.apache.http.client.methods.HttpGet;
6 | import org.apache.http.client.protocol.HttpClientContext;
7 | import org.apache.http.impl.client.BasicCookieStore;
8 | import org.apache.http.impl.client.CloseableHttpClient;
9 | import org.apache.http.impl.client.HttpClients;
10 | import org.apache.http.impl.client.LaxRedirectStrategy;
11 | import org.apache.http.impl.cookie.BasicClientCookie;
12 | import org.apache.http.util.EntityUtils;
13 | import org.jsoup.Jsoup;
14 | import org.jsoup.nodes.Document;
15 | import org.jsoup.nodes.Element;
16 | import org.jsoup.select.Elements;
17 |
18 | import java.io.IOException;
19 | import java.net.URL;
20 | import java.util.Calendar;
21 |
22 | public class ZhihuCrawler {
23 | private final static String RAW_COOKIES = "paste your browser's zhihu.com cookies here";
24 | private static final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36";
25 | private static final CloseableHttpClient httpClient = HttpClients.custom()
26 | .setUserAgent(userAgent)
27 | //make POST requests follow redirects by default
28 | .setRedirectStrategy(new LaxRedirectStrategy())
29 | .build();
30 | private static final HttpClientContext httpClientContext = HttpClientContext.create();
31 | private static CookieStore cookieStore = new BasicCookieStore();
32 |
33 |
34 | static {
35 | for (String rawCookie : RAW_COOKIES.split("; ")){
36 | String[] s = rawCookie.split("=");
37 | BasicClientCookie cookie = new BasicClientCookie(s[0], s[1]);
38 | cookie.setDomain("zhihu.com");
39 | cookie.setPath("/");
40 | cookie.setSecure(false);
41 | cookie.setAttribute("domain", "zhihu.com");
42 | Calendar calendar = Calendar.getInstance();
43 | calendar.add(Calendar.DAY_OF_MONTH, +5);
44 | cookie.setExpiryDate(calendar.getTime());
45 | cookieStore.addCookie(cookie);
46 | }
47 | httpClientContext.setCookieStore(cookieStore);
48 | }
49 | public static void main(String[] args) throws IOException {
50 | HttpGet httpGet = new HttpGet("https://www.zhihu.com");
51 | CloseableHttpResponse response = httpClient.execute(httpGet, httpClientContext);
52 | String s = EntityUtils.toString(response.getEntity());
53 | Document document = Jsoup.parse(s);
54 | Elements elements = document.select("div[class=ContentItem AnswerItem]");
55 | for (Element e : elements){
56 | // System.out.println(e);
57 | System.out.println(e.select("meta[itemprop=url]").first().attr("content"));
58 | System.out.println(e.select("meta[itemprop=name]").first().attr("content"));
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/WeiboCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import com.jayway.jsonpath.DocumentContext;
4 | import com.jayway.jsonpath.JsonPath;
5 | import org.apache.http.client.methods.HttpPost;
6 |
7 | import java.io.IOException;
8 | import java.net.URLEncoder;
9 | import java.util.HashMap;
10 | import java.util.Map;
11 | import java.util.regex.Matcher;
12 | import java.util.regex.Pattern;
13 |
14 | /**
15 | * Created by wangyang on 2017/8/22.
16 | */
17 | public class WeiboCrawler {
18 | private static final String CHECK_URL = "https://login.sina.com.cn/sso/prelogin.php?checkpin=1&entry=mweibo&su=MTMwODgyODA4NjA=&callback=jsonpcallback1503386116934";
19 | private static final String LOGIN_URL = "https://passport.weibo.cn/sso/login";
20 | private static final String POST_ARGS = "username=13268037201&password=password&savestate=1&r=http%3A%2F%2Fm.weibo.cn%2F&ec=0&pagerefer=https%3A%2F%2Fpassport.weibo.cn%2Fsignin%2Fwelcome%3Fentry%3Dmweibo%26r%3Dhttp%253A%252F%252Fm.weibo.cn%252F&entry=mweibo&wentry=&loginfrom=&client_id=&code=&qq=&mainpageflag=1&hff=&hfp=";
21 | private static final String KEYWORD_ARGS = "type=all&queryVal=${keyword}&luicode=10000011&lfid=106003type%3D1&title=${keyword}&containerid=100103type%3D1%26q%3D${keyword}";
22 | /**
23 | * Log in, then hit the search url
24 | */
25 | public static void main(String[] args) throws IOException {
26 | String searchUrl = "https://m.weibo.cn/api/container/getIndex";
27 | String keyword = "联想电脑";
28 | System.out.println(HttpClientUtil.get(CHECK_URL));
29 | HttpPost post = new HttpPost(LOGIN_URL);
30 | //this header is required
31 | post.addHeader("Referer", "https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F");
32 | HttpClientUtil.setHttpPostParams(post, queryToMap(POST_ARGS));
33 | String res = HttpClientUtil.getWebPage(post);
34 | System.out.println(res);
35 | System.out.println(HttpClientUtil.get("https://m.weibo.cn/"));
36 |
37 |
38 | String searchArgs = KEYWORD_ARGS.replaceAll("\\$\\{keyword\\}", URLEncoder.encode(keyword, "utf-8"));
39 | searchUrl = searchUrl + "?" + searchArgs;
40 | String searchRes = HttpClientUtil.get(searchUrl);
41 | Pattern pattern = Pattern.compile("idstr\":\"(\\d+)\"");
42 | Matcher matcher = pattern.matcher(searchRes);
43 | while (matcher.find()){
44 | String commentId = matcher.group(1);
45 | System.out.println(commentId);
46 | }
47 | // String result = HttpClientUtil.get("https://m.weibo.cn/api/comments/show?id=4154417035431509&page=1");
48 | // System.out.println(result);
49 | }
50 | /**
51 | * returns the url parameters in a map
52 | * @param query
53 | * @return map
54 | */
55 | public static Map<String, String> queryToMap(String query){
56 | if (query == null){
57 | query = "";
58 | }
59 | Map<String, String> result = new HashMap<>();
60 | for (String param : query.split("&")) {
61 | String[] pair = param.split("=");
62 | if (pair.length>1) {
63 | result.put(pair[0], pair[1]);
64 | }else{
65 | result.put(pair[0], "");
66 | }
67 | }
68 | return result;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/music163/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |
3 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.github.wycm</groupId>
8 |     <artifactId>music163</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 |
11 |     <name>music163</name>
12 |     <url>http://www.example.com</url>
13 |
14 |     <properties>
15 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |         <maven.compiler.source>1.8</maven.compiler.source>
17 |         <maven.compiler.target>1.8</maven.compiler.target>
18 |     </properties>
19 |
20 |     <dependencies>
21 |         <dependency>
22 |             <groupId>org.seleniumhq.selenium</groupId>
23 |             <artifactId>selenium-server</artifactId>
24 |             <version>3.0.1</version>
25 |         </dependency>
26 |         <dependency>
27 |             <groupId>org.slf4j</groupId>
28 |             <artifactId>slf4j-api</artifactId>
29 |             <version>1.7.6</version>
30 |         </dependency>
31 |         <dependency>
32 |             <groupId>org.slf4j</groupId>
33 |             <artifactId>slf4j-log4j12</artifactId>
34 |             <version>1.7.6</version>
35 |         </dependency>
36 |         <dependency>
37 |             <groupId>junit</groupId>
38 |             <artifactId>junit</artifactId>
39 |             <version>4.11</version>
40 |             <scope>test</scope>
41 |         </dependency>
42 |         <dependency>
43 |             <groupId>redis.clients</groupId>
44 |             <artifactId>jedis</artifactId>
45 |             <version>2.5.2</version>
46 |         </dependency>
47 |     </dependencies>
48 |
49 |     <build>
50 |         <resources>
51 |             <resource>
52 |                 <directory>src/main/resources</directory>
53 |                 <includes>
54 |                     <include>**/*</include>
55 |                 </includes>
56 |             </resource>
57 |             <resource>
58 |                 <directory>src/main/resources</directory>
59 |                 <includes>
60 |                     <include>log4j.properties</include>
61 |                 </includes>
62 |                 <filtering>true</filtering>
63 |             </resource>
64 |         </resources>
65 |
66 |         <plugins>
67 |             <plugin>
68 |                 <groupId>org.apache.maven.plugins</groupId>
69 |                 <artifactId>maven-assembly-plugin</artifactId>
70 |                 <version>2.5.5</version>
71 |                 <configuration>
72 |                     <archive>
73 |                         <manifest>
74 |                             <mainClass>com.github.wycm.Music163</mainClass>
75 |                         </manifest>
76 |                     </archive>
77 |                     <descriptorRefs>
78 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
79 |                     </descriptorRefs>
80 |                 </configuration>
81 |             </plugin>
82 |         </plugins>
83 |     </build>
84 |
85 |     <profiles>
86 |         <profile>
87 |             <id>dev</id>
88 |             <build>
89 |                 <filters>
90 |                     <filter>src/main/resources/log4j-dev.properties</filter>
91 |                 </filters>
92 |             </build>
93 |             <activation>
94 |                 <activeByDefault>true</activeByDefault>
95 |             </activation>
96 |         </profile>
97 |         <profile>
98 |             <id>prod</id>
99 |             <build>
100 |                 <filters>
101 |                     <filter>src/main/resources/log4j-prod.properties</filter>
102 |                 </filters>
103 |             </build>
104 |         </profile>
105 |     </profiles>
106 | </project>
--------------------------------------------------------------------------------
/music163/src/main/java/com/github/wycm/Music163.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.openqa.selenium.*;
4 | import org.openqa.selenium.chrome.ChromeDriver;
5 | import org.openqa.selenium.chrome.ChromeOptions;
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import java.util.*;
10 | import java.util.concurrent.TimeUnit;
11 | import java.util.regex.Matcher;
12 | import java.util.regex.Pattern;
13 |
14 | /**
15 | * Created by wycm
16 | */
17 | public class Music163 {
18 | private static Logger logger = LoggerFactory.getLogger(Music163.class);
19 |
20 | //paste the raw cookies from a logged-in browser session
21 | private final static String RAW_COOKIES = "cookie1=value1; cookie2=value2";
22 | private final static String CHROME_DRIVER_PATH = "/Users/wangyang/Downloads/chromedriver";
23 | //playlist id
24 | private static String startId = "22336453";
25 |
26 |
27 | private static String userId = null;
28 | private static Set<String> playListSet = new HashSet<>();
29 | private static Pattern pattern = Pattern.compile("(.*?)(.*?)");
30 | private static Pattern songName = Pattern.compile("class=\"f-thide name fc1 f-fl\" title=\"(.*?)\"");
31 | private static ChromeOptions chromeOptions = new ChromeOptions();
32 | private static WebDriver driver = null;
33 | static {
34 | System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
35 | chromeOptions.addArguments("--no-sandbox");
36 | }
37 | public static void main(String[] args) throws InterruptedException {
38 | while (true){
39 | try {
40 | driver = new ChromeDriver(chromeOptions);
41 | playListSet.add(startId);
42 | invoke();
43 | } catch (Exception e){
44 | logger.error(e.getMessage(), e);
45 | } finally {
46 | driver.quit();
47 | }
48 | Thread.sleep(1000 * 10);
49 | }
50 | }
51 |
52 | /**
53 | * Initialize cookies
54 | */
55 | private static void initCookies(){
56 | Arrays.stream(RAW_COOKIES.split("; ")).forEach(rawCookie -> {
57 | String[] ss = rawCookie.split("=");
58 | Cookie cookie = new Cookie.Builder(ss[0], ss[1]).domain(".163.com").build();
59 | driver.manage().addCookie(cookie);
60 | });
61 | }
62 | private static void invoke() throws InterruptedException {
63 | driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
64 | driver.manage().timeouts().pageLoadTimeout(15, TimeUnit.SECONDS);
65 | String s = null;
66 | driver.get("http://music.163.com/");
67 | initCookies();
68 | driver.get("http://music.163.com/");
69 | s = driver.getPageSource();
70 | userId = group(s, "userId:(\\d+)", 1);
71 | driver.get("https://music.163.com/#/playlist?id=" + startId);
72 | driver.switchTo().frame("contentFrame");
73 | WebElement element = driver.findElement(By.cssSelector("[id=content-operation]>a:first-child"));
74 | element.click();
75 | ((JavascriptExecutor) driver).executeScript("window.open('about:blank')");
76 | ArrayList<String> tabs = new ArrayList<>(driver.getWindowHandles());
77 | driver.switchTo().window(tabs.get(0));
78 | driver.switchTo().defaultContent();
79 | int i = 0;
80 | String lastSongName = "";
81 | int count = 0;
82 | while (true){
83 | if(i > Integer.MAX_VALUE - 2){
84 | break;
85 | }
86 | i++;
87 | s = driver.getPageSource();
88 | driver.switchTo().window(tabs.get(1)); //switches to new tab
89 | String songs = null;
90 | try{
91 | driver.get("https://music.163.com/user/home?id=" + userId);
92 | driver.switchTo().frame("contentFrame");
93 | songs = group(driver.getPageSource(), "累积听歌(\\d+)首", 1);
94 | } catch (TimeoutException e){
95 | logger.error(e.getMessage(), e);
96 | }
97 | driver.switchTo().window(tabs.get(0));
98 | Matcher matcher = pattern.matcher(s);
99 | Matcher songNameMatcher = songName.matcher(s);
100 | if (matcher.find() && songNameMatcher.find()){
101 | String songNameStr = songNameMatcher.group(1);
102 | if (!songNameStr.equals(lastSongName)){
103 | count++;
104 | lastSongName = songNameStr;
105 | }
106 | logger.info(songNameStr + "-" + matcher.group(1) + matcher.group(2) + "---now playing song #" + count + ", total songs played: " + songs);
107 | } else {
108 | logger.info("failed to parse the play record or song name");
109 | }
110 | Thread.sleep(1000 * 30);
111 | }
112 | }
113 | public static String group(String str, String regex, int index) {
114 | Pattern pattern = Pattern.compile(regex);
115 | Matcher matcher = pattern.matcher(str);
116 | return matcher.find() ? matcher.group(index) : "";
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/www.goubanjia.com/README.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | * The previous post used zhihu.com as an example of basic network-request analysis. This one covers a way to handle an anti-crawling technique: front-end data obfuscation.
3 | ## Goal
4 | * While writing https://github.com/wycm/zhihu-crawler I needed free http proxies and found this site: http://www.goubanjia.com/ . The goal is to crawl the ip and port values from it; interested readers can try it themselves.
5 | ## Getting started
6 | * Open the site's front page and inspect the tags holding the ip and port in the DevTools console.
7 | (screenshot, figure 1: the ip tag and its hidden child tags in the console)
8 | * As figure 1 shows, the ip is padded with irrelevant, non-displayed tags to obfuscate the data. The principle is simple: a tag carrying the style="display:none" attribute is never rendered on the page. Knowing that, the fix is easy: while parsing, drop every tag with style="display:none", and the real ip and port come out cleanly.
9 | * The code:
10 | ```
11 | package com.cnblogs.wycm;
12 |
13 | import org.jsoup.Jsoup;
14 | import org.jsoup.nodes.Document;
15 | import org.jsoup.nodes.Element;
16 | import org.jsoup.select.Elements;
17 | import java.io.IOException;
18 | import java.net.URL;
19 |
20 | /**
21 | *
22 | * Parsing uses the Jsoup framework, a Java library for working with HTML; it provides a very convenient API for extracting and manipulating data, with jquery-like selectors for finding tags.
23 | * Since the request here is simple, the http request is issued directly through Jsoup rather than via the HttpClient framework used in the previous post.
24 | * For Jsoup usage, see http://www.open-open.com/jsoup/
25 | *
26 | */
27 | public class Chapter1 {
28 | public static void main(String[] args) throws IOException {
29 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
30 | //select all tr nodes under the table with class='table'
31 | Elements elements = document.select("table[class=table] tr");
32 | for (int i = 1; i < elements.size(); i++){
33 | //get the first td node
34 | Element td = elements.get(i).select("td").first();
35 | /**
36 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
37 | * This covers both variants:
38 | * style=display: none;
39 | * style=display:none;
40 | */
41 | for(Element none : td.select("[style*=none;]")){
42 | none.remove();
43 | }
44 | //strip spaces
45 | String ipPort = td.text().replaceAll(" ", "");
46 | //print
47 | System.out.println(ipPort);
48 | }
49 | }
50 | }
51 | /*
52 | Output of the first run:
53 | 183.129.246.228:8132
54 | 222.92.136.206:8987
55 | 54.238.186.100:8988
56 | ...
57 | Output of the second run:
58 | 183.129.246.228:8377
59 | 222.92.136.206:9059
60 | 54.238.186.100:8622
61 | ...
62 | */
63 | ```
64 | * The ip now comes out correctly, but the port turns out to be obfuscated as well, and it changes on every request. Disable JavaScript in your browser and refresh the page: the port is different each time. The correct port we normally see is produced by a JavaScript routine, so an ordinary crawler only ever fetches a wrong value. To get the real port, we need to analyze the JavaScript logic that restores it.
65 | * Open the console again -> Sources -> set a breakpoint on a line of js code (click the line number), as below
66 | (screenshot: setting a js breakpoint in the Sources panel)
67 | * Refresh the page -> the page shows "Paused in debugger" -> switch to Elements -> right-click the td node -> Break on... -> subtree modifications. These two steps set up breakpoint debugging, pausing whenever the td node changes.
68 | (screenshot: adding a subtree-modifications breakpoint on the td node)
69 | * Go to Sources -> F8 (resume); execution pauses once more, exactly when the js script restores the correct port (as below)
70 | (screenshot: paused inside the script that restores the port)
71 | * The call stack is many levels deep; a quick way to pinpoint the right function is to watch how the local variables change. Since it is the port being modified here, find the matching variable and its logic. A little analysis shows the port is rewritten by an anonymous function, as below
72 | (screenshot: the anonymous function that rewrites the port)
73 | * Formatted, the code looks like this:
74 | ```
75 | var _$ = ['\x2e\x70\x6f\x72\x74', "\x65\x61\x63\x68", "\x68\x74\x6d\x6c", "\x69\x6e\x64\x65\x78\x4f\x66", '\x2a', "\x61\x74\x74\x72", '\x63\x6c\x61\x73\x73', "\x73\x70\x6c\x69\x74", "\x20", "", "\x6c\x65\x6e\x67\x74\x68", "\x70\x75\x73\x68", '\x41\x42\x43\x44\x45\x46\x47\x48\x49\x5a', "\x70\x61\x72\x73\x65\x49\x6e\x74", "\x6a\x6f\x69\x6e", ''];
76 | $(function() {
77 | $(_$[0])[_$[1]](function() {
78 | var a = $(this)[_$[2]]();
79 | if (a[_$[3]](_$[4]) != -0x1) {
80 | return
81 | }
82 | ;var b = $(this)[_$[5]](_$[6]);
83 | try {
84 | b = (b[_$[7]](_$[8]))[0x1];
85 | var c = b[_$[7]](_$[9]);
86 | var d = c[_$[10]];
87 | var f = [];
88 | for (var g = 0x0; g < d; g++) {
89 | f[_$[11]](_$[12][_$[3]](c[g]))
90 | }
91 | ;$(this)[_$[2]](window[_$[13]](f[_$[14]](_$[15])) >> 0x3)
92 | } catch (e) {}
93 | })
94 | })
95 | ```
96 | * De-obfuscated, it reads:
97 | ```
98 | var _$ = ['.port', "each", "html", "indexOf", '*', "attr", 'class', "split", " ", "", "length", "push", 'ABCDEFGHIZ', "parseInt", "join", ''];
99 | $(function() {
100 | $('.port').each(function() {
101 | var a = $(this).html();
102 | if (a.indexOf('*') != -0x1) {
103 | return
104 | }
105 | ;var b = $(this).attr('class');
106 | try {
107 | b = (b.split(" "))[0x1];
108 | var c = b.split("");
109 | var d = c.length;
110 | var f = [];
111 | for (var g = 0x0; g < d; g++) {
112 | f.push('ABCDEFGHIZ'.indexOf(c[g]))
113 | }
114 | ;$(this).html(window.parseInt(f.join('')) >> 0x3)
115 | } catch (e) {}
116 | })
117 | })
118 | ```
119 | * The logic of this code: read the port tag's class attribute, take the trailing group of capital letters, and for each character look up its index in 'ABCDEFGHIZ'; parseInt the concatenated digits, then shift the result right by 3 bits. For example, a hypothetical class of "port GEA" maps to indices 6, 4, 0, giving 640, and 640 >> 3 = 80.
120 | * Full implementation
121 | ```
122 | package com.cnblogs.wycm;
123 |
124 | import org.jsoup.Jsoup;
125 | import org.jsoup.nodes.Document;
126 | import org.jsoup.nodes.Element;
127 | import org.jsoup.select.Elements;
128 |
129 | import java.io.IOException;
130 | import java.net.URL;
131 |
132 | public class Chapter2 {
133 | public static void main(String[] args) throws IOException {
134 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000);
135 | setPort(document);
136 | //select all tr nodes under the table with class='table'
137 | Elements elements = document.select("table[class=table] tr");
138 | for (int i = 1; i < elements.size(); i++){
139 | //get the first td node
140 | Element td = elements.get(i).select("td").first();
141 | /**
142 | * Find all tags whose style attribute contains "none" (tags not rendered on the page) and remove them.
143 | * This covers both variants:
144 | * style=display: none;
145 | * style=display:none;
146 | */
147 | for(Element none : td.select("[style*=none;]")){
148 | none.remove();
149 | }
150 | //strip spaces
151 | String ipPort = td.text().replaceAll(" ", "");
152 | //print
153 | System.out.println(ipPort);
154 | }
155 | }
156 |
157 | /**
158 | * Port restoration, ported from the site's js code
159 | * @param doc
160 | */
161 | private static void setPort(Document doc){
162 | for (Element e : doc.select(".port")){//$('.port').each(function() {
163 | String a = e.text();//var a = $(this).html();
164 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) {
165 | continue;//a plain return inside jQuery's each() skips the element, not the whole loop
166 | }
167 | String b = e.attr("class");//var b = $(this).attr('class');
168 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1];
169 | String[] c = b.split("");//var c = b.split("");
170 | int d = b.length();//var d = c.length;
171 | StringBuilder f = new StringBuilder();//var f = [];
172 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) {
173 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g]))
174 | }
175 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3)
176 | }
177 | }
178 | }
179 | ```
180 | * Maven dependency
181 | ```
182 | <dependency>
183 |     <groupId>org.jsoup</groupId>
184 |     <artifactId>jsoup</artifactId>
185 |     <version>1.10.2</version>
186 | </dependency>
187 | ```
188 | ## Summary
189 | * This post briefly showed one way to deal with front-end obfuscation as an anti-crawling measure. There are other ways to handle it, such as a headless browser, e.g. the phantomjs framework. Headless browsers were originally built for automated testing; being webkit-based, they render pages like a real browser, so they can crawl this kind of front-end-obfuscated site fairly easily. In general, whatever a normal browser can display, this approach can also crawl. Its biggest problem is efficiency: every page load also downloads the page's extra resources such as js scripts, and once downloaded the scripts still have to be executed.
190 | * The approach taken here is to read the js code, work out the obfuscation logic, and re-implement that logic in the target language. It works well against simple encryption or obfuscation. But for large, complex sites such as Baidu or Weibo, where the data sits behind a login, hand-rolled simulated logins get much harder: hunting down where every login parameter comes from eats a lot of effort, and the cost of analyzing the requests is high. The advantage of this approach is speed: it fetches only the target data, with no extra network-request overhead.
--------------------------------------------------------------------------------
/weibo/src/main/java/com/github/wycm/HttpClientUtil.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.*;
4 | import org.apache.http.client.CookieStore;
5 | import org.apache.http.client.HttpRequestRetryHandler;
6 | import org.apache.http.client.config.CookieSpecs;
7 | import org.apache.http.client.config.RequestConfig;
8 | import org.apache.http.client.entity.UrlEncodedFormEntity;
9 | import org.apache.http.client.methods.CloseableHttpResponse;
10 | import org.apache.http.client.methods.HttpGet;
11 | import org.apache.http.client.methods.HttpPost;
12 | import org.apache.http.client.methods.HttpRequestBase;
13 | import org.apache.http.client.protocol.HttpClientContext;
14 | import org.apache.http.config.ConnectionConfig;
15 | import org.apache.http.config.Registry;
16 | import org.apache.http.config.RegistryBuilder;
17 | import org.apache.http.config.SocketConfig;
18 | import org.apache.http.conn.ConnectTimeoutException;
19 | import org.apache.http.conn.socket.ConnectionSocketFactory;
20 | import org.apache.http.conn.socket.PlainConnectionSocketFactory;
21 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
22 | import org.apache.http.conn.ssl.TrustStrategy;
23 | import org.apache.http.cookie.Cookie;
24 | import org.apache.http.impl.client.*;
25 | import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
26 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
27 | import org.apache.http.message.BasicNameValuePair;
28 | import org.apache.http.protocol.HttpContext;
29 | import org.apache.http.ssl.SSLContexts;
30 | import org.apache.log4j.Logger;
31 |
32 | import javax.net.ssl.SSLContext;
33 | import javax.net.ssl.SSLException;
34 | import java.io.*;
35 | import java.net.UnknownHostException;
36 | import java.nio.charset.CodingErrorAction;
37 | import java.security.KeyStore;
38 | import java.security.cert.CertificateException;
39 | import java.security.cert.X509Certificate;
40 | import java.util.ArrayList;
41 | import java.util.List;
42 | import java.util.Map;
43 |
44 | /**
45 | * HttpClient utility class
46 | */
47 | public class HttpClientUtil {
48 | private static Logger logger = Logger.getLogger(HttpClientUtil.class);
49 | private static CloseableHttpClient httpClient;
50 | private final static HttpClientContext httpClientContext = HttpClientContext.create();
51 | private final static String userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727)";
52 | private static HttpHost proxy;
53 | private static RequestConfig requestConfig;
54 | static {
55 | init();
56 | }
57 | private static void init() {
58 | try {
59 | SSLContext sslContext =
60 | SSLContexts.custom()
61 | .loadTrustMaterial(KeyStore.getInstance(KeyStore.getDefaultType()), new TrustStrategy() {
62 | @Override
63 | public boolean isTrusted(X509Certificate[] chain, String authType)
64 | throws CertificateException {
65 | return true;
66 | }
67 | }).build();
68 | SSLConnectionSocketFactory sslSFactory =
69 | new SSLConnectionSocketFactory(sslContext);
70 | Registry<ConnectionSocketFactory> socketFactoryRegistry =
71 | RegistryBuilder.<ConnectionSocketFactory>create()
72 | .register("http", PlainConnectionSocketFactory.INSTANCE).register("https", sslSFactory)
73 | .build();
74 |
75 | PoolingHttpClientConnectionManager connManager =
76 | new PoolingHttpClientConnectionManager(socketFactoryRegistry);
77 |
78 | SocketConfig socketConfig = SocketConfig.custom().setTcpNoDelay(true).build();
79 | connManager.setDefaultSocketConfig(socketConfig);
80 |
81 | ConnectionConfig connectionConfig =
82 | ConnectionConfig.custom().setMalformedInputAction(CodingErrorAction.IGNORE)
83 | .setUnmappableInputAction(CodingErrorAction.IGNORE).setCharset(Consts.UTF_8).build();
84 | connManager.setDefaultConnectionConfig(connectionConfig);
85 | connManager.setMaxTotal(300);
86 | connManager.setDefaultMaxPerRoute(100);
87 |
88 | HttpRequestRetryHandler retryHandler = new HttpRequestRetryHandler() {
89 | @Override
90 | public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
91 | if (executionCount > 0) {
92 | return false;
93 | }
94 | if (exception instanceof InterruptedIOException) {
95 | return true;
96 | }
97 | if (exception instanceof ConnectTimeoutException) {
98 | return true;
99 | }
100 | if (exception instanceof UnknownHostException) {
101 | return true;
102 | }
103 | if (exception instanceof SSLException) {
104 | return true;
105 | }
106 | HttpRequest request = HttpClientContext.adapt(context).getRequest();
107 | if (!(request instanceof HttpEntityEnclosingRequest)) {
108 | return true;
109 | }
110 | return false;
111 | }
112 | };
113 |
114 | HttpClientBuilder httpClientBuilder =
115 | HttpClients.custom().setConnectionManager(connManager).setRetryHandler(retryHandler)
116 | //make POST requests follow redirects by default
117 | .setRedirectStrategy(new LaxRedirectStrategy())
118 | .setDefaultCookieStore(new BasicCookieStore()).setUserAgent(userAgent);
119 | if (proxy != null) {
120 | httpClientBuilder.setRoutePlanner(new DefaultProxyRoutePlanner(proxy)).build();
121 | }
122 | httpClient = httpClientBuilder.build();
123 |
124 | requestConfig = RequestConfig.custom().setSocketTimeout(10000).
125 | setConnectTimeout(10000).
126 | setConnectionRequestTimeout(10000).
127 | setCookieSpec(CookieSpecs.STANDARD).
128 | build();
129 | } catch (Exception e) {
130 | logger.error(e.getMessage());
131 | }
132 | }
133 | public static String get(String url) throws IOException {
134 | HttpGet request = new HttpGet(url);
135 | return getWebPage(request, null, "UTF-8", false);
136 | }
137 | public static String get(HttpRequestBase request, RequestConfig config) throws IOException {
138 | return getWebPage(request, config, "UTF-8", false);
139 | }
140 | public static String getWebPage(HttpRequestBase request) throws IOException {
141 | return getWebPage(request, null, "UTF-8", false);
142 | }
143 | public static String getWebPage(HttpRequestBase request, RequestConfig config) throws IOException {
144 | return getWebPage(request, config, "UTF-8", false);
145 | }
146 | /**
147 | * @param request the request
148 | * @param config the request config; the default is used when null
149 | * @param encoding character encoding
150 | * @param isPrintConsole whether to also print each line to the console
151 | * @return the page content
152 | */
153 | public static String getWebPage(HttpRequestBase request,
154 | RequestConfig config,
155 | String encoding,
156 | boolean isPrintConsole) throws IOException {
157 | CloseableHttpResponse response = null;
158 | if (config != null){
159 | request.setConfig(config);
160 | }
161 | else {
162 | request.setConfig(requestConfig);
163 | }
164 | response = httpClient.execute(request, httpClientContext);
165 | logger.info("status---" + response.getStatusLine().getStatusCode());
166 | BufferedReader rd = null;
167 | StringBuilder webPage = null;
168 | try {
169 | rd = new BufferedReader(
170 | new InputStreamReader(response.getEntity().getContent(),encoding));
171 | String line = "";
172 | webPage = new StringBuilder();
173 | while((line = rd.readLine()) != null) {
174 | webPage.append(line);
175 | if(isPrintConsole){
176 | System.out.println(line);
177 | }
178 | }
179 | } catch (IOException e) {
180 | e.printStackTrace();
181 | }
182 | request.releaseConnection();
183 | response.close();
184 | return webPage.toString();
185 | }
186 | /**
187 | * Set url-encoded form parameters on the request
188 | * @param request
189 | * @param params
190 | */
191 | public static void setHttpPostParams(HttpPost request, Map<String, String> params) throws UnsupportedEncodingException {
192 | List<NameValuePair> formParams = new ArrayList<>();
193 | for (String key : params.keySet()) {
194 | formParams.add(new BasicNameValuePair(key,params.get(key)));
195 | }
196 | UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, "utf-8");
197 | request.setEntity(entity);
198 | }
199 | }
200 |
--------------------------------------------------------------------------------
/www.dianping.com/src/test/resources/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/www.dianping.com/src/main/java/com/github/wycm/DianpingCrawler.java:
--------------------------------------------------------------------------------
1 | package com.github.wycm;
2 |
3 | import org.apache.http.client.methods.HttpGet;
4 | import org.apache.http.impl.client.CloseableHttpClient;
5 | import org.apache.http.impl.client.HttpClients;
6 | import org.apache.http.util.EntityUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.jsoup.nodes.Element;
10 |
11 | import java.io.IOException;
12 | import java.util.*;
13 | import java.util.regex.Matcher;
14 | import java.util.regex.Pattern;
15 | import java.util.stream.Collectors;
16 |
17 | /**
18 | * Meituan-Dianping font-based anti-crawling
19 | */
20 | public class DianpingCrawler {
21 | public static void main(String[] args) throws IOException {
22 | getContent("http://www.dianping.com/shop/96231053");
23 | }
24 | private static void getContent(String detailUrl) throws IOException {
25 | CloseableHttpClient httpClient = HttpClients
26 | .custom()
27 | .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
28 | .build();
29 | String originalContent = EntityUtils.toString(httpClient.execute(new HttpGet(detailUrl)).getEntity());
30 | Document document= Jsoup.parse(originalContent);
31 | String cssUrl = "http:" + document.select("link[href*=svgtextcss]").first().attr("href");
32 | String cssResponse = Jsoup.connect(cssUrl).execute().body();
33 | // System.out.println(cssResponse);
34 | Pattern pattern = Pattern.compile("class\\^=\"(.*?)\".*?url\\((.*?)\\)");
35 | Matcher matcher = pattern.matcher(cssResponse);
36 | Map<String, String> urlMap = new HashMap<>();
37 | Map<String, String> svgMap = new HashMap<>();
38 | while (matcher.find()){
39 | String prefix = matcher.group(1);
40 | String url = "http:" + matcher.group(2);
41 | urlMap.put(prefix, url);
42 | svgMap.put(prefix, EntityUtils.toString(httpClient.execute(new HttpGet(url)).getEntity(), "utf-8"));
43 | System.out.println(prefix);
44 | System.out.println(url);
45 | }
46 | pattern = Pattern.compile("\\.[a-z]{2}-.*?\\{.*?\\}");
47 | matcher = pattern.matcher(cssResponse);
48 | List<CssBackground> cssList = new ArrayList<>();
49 | Pattern cssBackgroundPattern = Pattern.compile("(\\.([a-z]{2})-.*?)\\{background:(.*?)\\.0px (.*?)\\.0px");
50 | Matcher cssBackgroundMatch;
51 | while (matcher.find()){
52 | cssBackgroundMatch = cssBackgroundPattern.matcher(matcher.group(0));
53 | if (cssBackgroundMatch.find()){
54 | cssList.add(new CssBackground(cssBackgroundMatch.group(1), Integer.valueOf(cssBackgroundMatch.group(3)), Integer.valueOf(cssBackgroundMatch.group(4))));
55 | }
56 | }
57 | //sort the css rules, grouping by class prefix
58 | cssList.sort((c1, c2) ->{
59 | int i = c1.getClassName().substring(0, 3).compareTo(c2.getClassName().substring(0, 3));
60 | if (i != 0){
61 | return i;
62 | } else {
63 | i = c2.getY().compareTo(c1.getY());
64 | if (i != 0){
65 | return i;
66 | } else {
67 | return c2.getX().compareTo(c1.getX());
68 | }
69 | }
70 | });
71 | cssList.forEach(System.out::println);
72 | int xIndex = 0;
73 | int yIndex = 0;
74 | CssBackground lastCssBackground = null;
75 | //work out each glyph's index coordinates
76 | for(CssBackground c : cssList){
77 | if (lastCssBackground == null){
78 | lastCssBackground = c;
79 | continue;
80 | } else {
81 | if (!c.getClassName().substring(0, 3).equals(lastCssBackground.getClassName().substring(0, 3))){
82 | xIndex = 0;
83 | yIndex = 0;
84 | } else if (!c.getX().equals(lastCssBackground.getX()) && c.getY().equals(lastCssBackground.getY())){
85 | c.setxIndex(++xIndex);
86 | c.setyIndex(yIndex);
87 | } else if (c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){
88 | c.setxIndex(xIndex);
89 | c.setyIndex(++yIndex);
90 | } else if (!c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){
91 | xIndex = 0;
92 | c.setxIndex(xIndex);
93 | c.setyIndex(++yIndex);
94 | }
95 | lastCssBackground = c;
96 | }
97 | }
98 | Map<String, Document> cacheDocumentMap = new HashMap<>();
99 | Map<String, CssBackground> cssBackgroundMap = new HashMap<>();
100 | String lastPrefix = "";
101 | cssList.stream().map(c -> {
102 | c.setSvgResponse(svgMap.get(c.getClassName().substring(1, 4)));
103 | if (!cacheDocumentMap.containsKey(c.getClassName().substring(0, 3))){
104 | cacheDocumentMap.put(c.getClassName().substring(0, 3), Jsoup.parse(c.getSvgResponse()));
105 | }
106 | c.setDocument(cacheDocumentMap.get(c.getClassName().substring(0, 3)));
107 | Document doc = c.getDocument();
108 | Element e = null;
109 | if ((c.getX() == -6 && c.getY() == -6) || (c.getX() % -12 == -7 && c.getY() == -6)){
110 | e = doc.select("text").first();
111 | } else if ((c.getX() == -7 && c.getY() == -7) || (c.getX() % 14 == -8 && c.getY() == -7)){
112 | e = doc.select("text").first();
113 | } else if (c.getX() % 6 == -1 && c.getY() == -6){
114 | e = doc.select("text").first();
115 | } else if (c.getX() % -12 == 0 && c.getY() % -30 == -6){
116 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first();
117 | } else if (c.getX() % -14 == 0 && c.getY() % -30 == -7){
118 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first();
119 | }
120 | if (e == null){
121 | //belongs with the previous one
122 | //CssBackground{className='.hy-GijB', x=-7, y=-6, xIndex=0, yIndex=0, actualFont='null'}
123 | //CssBackground{className='.hy-o8Bu', x=-19, y=-6, xIndex=0, yIndex=0, actualFont='null'}
124 | //CssBackground{className='.hy-7IxC', x=-31, y=-6, xIndex=0, yIndex=0, actualFont='null'}
125 | //CssBackground{className='.hy-8zQE', x=-43, y=-6, xIndex=0, yIndex=0, actualFont='null'}
126 | //CssBackground{className='.hy-PrgG', x=-55, y=-6, xIndex=0, yIndex=0, actualFont='null'}
127 | //CssBackground{className='.hy-Qbc8', x=-67, y=-6, xIndex=0, yIndex=0, actualFont='null'}
128 | //CssBackground{className='.hy-TnVD', x=-79, y=-6, xIndex=0, yIndex=0, actualFont='null'}
129 | //CssBackground{className='.hy-TqUO', x=-91, y=-6, xIndex=0, yIndex=0, actualFont='null'}
130 | //CssBackground{className='.hy-UkCG', x=-103, y=-6, xIndex=0, yIndex=0, actualFont='null'}
131 | //CssBackground{className='.hy-yOPP', x=-114, y=-6, xIndex=0, yIndex=0, actualFont='null'}
132 | //TODO: the last one does not satisfy the rule
133 | }
134 | String text = e.text();
135 | c.setActualFont(text.substring(c.getxIndex(), c.getxIndex() + 1));
136 | cssBackgroundMap.put(c.getClassName().substring(1, c.getClassName().length()), c);
137 | return c;
138 | }).collect(Collectors.toList());
139 | //rebuild the page: replace each obfuscated tag with its actual glyph
140 | Pattern spanPattern = Pattern.compile("<span class=\"(.*?)\"></span>"); //the original tag regex was lost; an empty span carrying the obfuscation class is assumed here
141 | Matcher contentMatcher = spanPattern.matcher(originalContent);
142 | while (contentMatcher.find()){
143 | String s1 = contentMatcher.group(0);
144 | String s2 = cssBackgroundMap.get(contentMatcher.group(1)).getActualFont();
145 | originalContent = originalContent.replace(s1, s2);
146 | }
147 | System.out.println(originalContent);
148 | }
149 | static class CssBackground{
150 | private String className;
151 | private Integer x;
152 | private Integer y;
153 | private int xIndex;
154 | private int yIndex;
155 | private String svgResponse;
156 | private String actualFont;
157 | private Document document;
158 |
159 | public CssBackground(String className, int x, int y) {
160 | this.className = className;
161 | this.x = x;
162 | this.y = y;
163 | }
164 |
165 | public String getClassName() {
166 | return className;
167 | }
168 |
169 | public void setClassName(String className) {
170 | this.className = className;
171 | }
172 |
173 | public Integer getX() {
174 | return x;
175 | }
176 |
177 | public void setX(Integer x) {
178 | this.x = x;
179 | }
180 |
181 | public Integer getY() {
182 | return y;
183 | }
184 |
185 | public void setY(Integer y) {
186 | this.y = y;
187 | }
188 |
189 | public int getxIndex() {
190 | return xIndex;
191 | }
192 |
193 | public void setxIndex(int xIndex) {
194 | this.xIndex = xIndex;
195 | }
196 |
197 | public int getyIndex() {
198 | return yIndex;
199 | }
200 |
201 | public void setyIndex(int yIndex) {
202 | this.yIndex = yIndex;
203 | }
204 |
205 | public String getSvgResponse() {
206 | return svgResponse;
207 | }
208 |
209 | public void setSvgResponse(String svgResponse) {
210 | this.svgResponse = svgResponse;
211 | }
212 |
213 | public String getActualFont() {
214 | return actualFont;
215 | }
216 |
217 | public void setActualFont(String actualFont) {
218 | this.actualFont = actualFont;
219 | }
220 |
221 | public Document getDocument() {
222 | return document;
223 | }
224 |
225 | public void setDocument(Document document) {
226 | this.document = document;
227 | }
228 |
229 | @Override
230 | public String toString() {
231 | return "CssBackground{" +
232 | "className='" + className + '\'' +
233 | ", x=" + x +
234 | ", y=" + y +
235 | ", xIndex=" + xIndex +
236 | ", yIndex=" + yIndex +
237 | ", actualFont='" + actualFont + '\'' +
238 | '}';
239 | }
240 | }
241 | }
--------------------------------------------------------------------------------