├── README.md ├── weibo ├── README.md ├── src │ └── main │ │ └── java │ │ └── com │ │ └── github │ │ └── wycm │ │ ├── WeiboReplyCrawler.java │ │ ├── WeiboCrawler.java │ │ └── HttpClientUtil.java └── pom.xml ├── www.zhihu.com ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── github │ └── wycm │ └── ZhihuCrawler.java ├── music163 ├── .DS_Store ├── .gitignore ├── src │ ├── test │ │ └── java │ │ │ └── com │ │ │ └── github │ │ │ └── wycm │ │ │ └── AppTest.java │ └── main │ │ ├── resources │ │ ├── log4j-dev.properties │ │ ├── log4j-prod.properties │ │ └── log4j.properties │ │ └── java │ │ └── com │ │ └── github │ │ └── wycm │ │ └── Music163.java ├── server-auto-deploy.sh └── pom.xml ├── www.douban.com ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── github │ └── wycm │ └── DoubanCrawler.java ├── .gitignore ├── www.dianping.com ├── README.md ├── pom.xml └── src │ ├── test │ ├── java │ │ └── com │ │ │ └── github │ │ │ └── wycm │ │ │ └── DianpingCrawlerTest.java │ └── resources │ │ └── test.html │ └── main │ └── java │ └── com │ └── github │ └── wycm │ └── DianpingCrawler.java └── www.goubanjia.com ├── pom.xml ├── src └── main │ └── java │ └── com │ └── github │ └── wycm │ └── GoubanjiaCrawler.java └── README.md /README.md: -------------------------------------------------------------------------------- 1 | crawler-set 2 | =========== 3 | * 各种网站爬虫合集,持续更新中.... 
-------------------------------------------------------------------------------- /weibo/README.md: -------------------------------------------------------------------------------- 1 | crawler-set 2 | =========== 3 | 4 | ## 使用selenium抓取豆瓣图书搜索动态页面 -------------------------------------------------------------------------------- /www.zhihu.com/README.md: -------------------------------------------------------------------------------- 1 | crawler-set 2 | =========== 3 | ## 通过浏览器cookie获取知乎首页的简单demo -------------------------------------------------------------------------------- /music163/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wycm/crawler-set/HEAD/music163/.DS_Store -------------------------------------------------------------------------------- /www.douban.com/README.md: -------------------------------------------------------------------------------- 1 | crawler-set 2 | =========== 3 | 4 | ## 使用selenium抓取豆瓣图书搜索动态页面 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # idea 2 | .idea/ 3 | *.iml 4 | 5 | target/ 6 | .classpath 7 | .project 8 | .settings/ 9 | -------------------------------------------------------------------------------- /music163/.gitignore: -------------------------------------------------------------------------------- 1 | # idea 2 | .idea/ 3 | *.iml 4 | 5 | target/ 6 | .classpath 7 | .project 8 | .settings/ 9 | 10 | zhihucookies 11 | proxies 12 | zhiHuYZM.gif -------------------------------------------------------------------------------- /www.dianping.com/README.md: -------------------------------------------------------------------------------- 1 | ## 摘要 2 | * 大众点评字体反爬分析解决 3 | * 待更新 4 | 5 | >![](http://upload-images.jianshu.io/upload_images/5830895-6a8b96dde229c26c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 6 |
一个程序员日常分享,包括但不限于爬虫、Java后端技术,欢迎关注。 -------------------------------------------------------------------------------- /weibo/src/main/java/com/github/wycm/WeiboReplyCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.apache.http.impl.cookie.BasicClientCookie; 4 | 5 | import java.util.Calendar; 6 | 7 | public class WeiboReplyCrawler { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /music163/src/test/java/com/github/wycm/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | { 12 | /** 13 | * Rigorous Test :-) 14 | */ 15 | @Test 16 | public void shouldAnswerWithTrue() 17 | { 18 | assertTrue( true ); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /music163/server-auto-deploy.sh: -------------------------------------------------------------------------------- 1 | sudo ps -ef |grep music163 | grep -v grep |awk '{print $2}'|xargs kill -9 2 | sudo ps -ef |grep chrome | grep -v grep |awk '{print $2}'|xargs kill -9 3 | 4 | cd /alidata/server/workspace/music163 5 | git pull origin master 6 | echo 'pull success' 7 | mvn clean 8 | mvn -Pprod package assembly:single 9 | echo 'mvn success' 10 | export DISPLAY=:99 11 | cd target 12 | java -jar music163-1.0-SNAPSHOT-jar-with-dependencies.jar & 13 | -------------------------------------------------------------------------------- /music163/src/main/resources/log4j-dev.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Threshold=INFO 5 | 
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n 7 | 8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender 9 | log4j.appender.A1.Threshold=INFO 10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n 12 | log4j.appender.A1.DatePattern='.'yyyyMMdd 13 | log4j.appender.A1.File=d:/log/all.log 14 | -------------------------------------------------------------------------------- /music163/src/main/resources/log4j-prod.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | 3 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.Threshold=INFO 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d %p [%x,%t] - [%c] - %m%n 7 | 8 | log4j.appender.A1=org.apache.log4j.DailyRollingFileAppender 9 | log4j.appender.A1.Threshold=INFO 10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%d %p [%x,%t] - [%c] - <%m>%n 12 | log4j.appender.A1.DatePattern='.'yyyyMMdd 13 | log4j.appender.A1.File=/alidata/log/music163.log 14 | -------------------------------------------------------------------------------- /music163/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=${log4j.rootLogger} 2 | 3 | log4j.appender.stdout=${log4j.appender.stdout} 4 | log4j.appender.stdout.Threshold=${log4j.appender.stdout.Threshold} 5 | log4j.appender.stdout.layout=${log4j.appender.stdout.layout} 6 | log4j.appender.stdout.layout.ConversionPattern=${log4j.appender.stdout.layout.ConversionPattern} 7 | 8 | log4j.appender.A1=${log4j.appender.A1} 9 | log4j.appender.A1.Threshold=${log4j.appender.A1.Threshold} 10 | 
log4j.appender.A1.layout=${log4j.appender.A1.layout} 11 | log4j.appender.A1.layout.ConversionPattern=${log4j.appender.A1.layout.ConversionPattern} 12 | log4j.appender.A1.DatePattern=${log4j.appender.A1.DatePattern} 13 | log4j.appender.A1.File=${log4j.appender.A1.File} -------------------------------------------------------------------------------- /www.goubanjia.com/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.wycm 5 | goubanjia 6 | jar 7 | 1.0-SNAPSHOT 8 | goubanjia-crawler 9 | http://maven.apache.org 10 | 11 | 12 | 13 | 14 | org.jsoup 15 | jsoup 16 | 1.10.2 17 | 18 | 19 | 20 | 21 | 22 | 23 | org.apache.maven.plugins 24 | maven-compiler-plugin 25 | 2.0.2 26 | 27 | 1.7 28 | 1.7 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /www.zhihu.com/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.wycm 5 | zhihu 6 | jar 7 | 1.0-SNAPSHOT 8 | zhihu-crawler 9 | http://maven.apache.org 10 | 11 | 12 | 13 | 14 | org.jsoup 15 | jsoup 16 | 1.10.2 17 | 18 | 19 | 20 | org.apache.httpcomponents 21 | httpclient 22 | 4.5 23 | 24 | 25 | 26 | 27 | 28 | 29 | org.apache.maven.plugins 30 | maven-compiler-plugin 31 | 2.0.2 32 | 33 | 1.7 34 | 1.7 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /www.douban.com/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.wycm 5 | douban 6 | jar 7 | 1.0-SNAPSHOT 8 | selenium-geetest-crack 9 | http://maven.apache.org 10 | 11 | 12 | 13 | org.seleniumhq.selenium 14 | selenium-server 15 | 3.0.1 16 | 17 | 18 | 19 | org.jsoup 20 | jsoup 21 | 1.7.2 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.apache.maven.plugins 29 | maven-compiler-plugin 30 | 2.0.2 31 | 32 | 1.7 33 | 1.7 34 | 35 | 36 | 37 | 38 | 39 | 
-------------------------------------------------------------------------------- /www.dianping.com/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.wycm 5 | dianping 6 | jar 7 | 1.0-SNAPSHOT 8 | dianping-crawler 9 | http://maven.apache.org 10 | 11 | 12 | 13 | org.jsoup 14 | jsoup 15 | 1.10.2 16 | 17 | 18 | org.apache.httpcomponents 19 | httpclient 20 | 4.5 21 | 22 | 23 | junit 24 | junit 25 | 4.12 26 | 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-compiler-plugin 33 | 2.0.2 34 | 35 | 1.8 36 | 1.8 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /www.dianping.com/src/test/java/com/github/wycm/DianpingCrawlerTest.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.apache.http.client.methods.CloseableHttpResponse; 4 | import org.apache.http.client.methods.HttpGet; 5 | import org.apache.http.impl.client.HttpClients; 6 | import org.apache.http.util.EntityUtils; 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.select.Elements; 9 | import org.junit.Test; 10 | 11 | import java.io.BufferedReader; 12 | import java.io.IOException; 13 | import java.io.InputStreamReader; 14 | import java.util.stream.Collectors; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | /** 19 | * Created by wycm on 2018/11/19. 
20 | */ 21 | public class DianpingCrawlerTest { 22 | @Test 23 | public void testJsoup(){ 24 | String s = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/test.html"))).lines() 25 | .parallel().collect(Collectors.joining("\n")); 26 | Elements elements = Jsoup.parse(s).select("textPath[xlink:href='#1']"); 27 | System.out.println(elements); 28 | } 29 | @Test 30 | public void testHttp() throws IOException { 31 | CloseableHttpResponse response = HttpClients.createDefault().execute(new HttpGet("http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/807789f715a7caed8e7c2475dcf94e20.svg")); 32 | System.out.println(EntityUtils.toString(response.getEntity(), "utf-8")); 33 | } 34 | } -------------------------------------------------------------------------------- /weibo/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.wycm 5 | weibo 6 | jar 7 | 1.0-SNAPSHOT 8 | selenium-geetest-crack 9 | http://maven.apache.org 10 | 11 | 12 | 13 | org.jsoup 14 | jsoup 15 | 1.7.2 16 | 17 | 18 | org.apache.httpcomponents 19 | httpclient 20 | 4.5 21 | 22 | 23 | log4j 24 | log4j 25 | 1.2.15 26 | 27 | 28 | com.jayway.jsonpath 29 | json-path 30 | 2.2.0 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-compiler-plugin 38 | 2.0.2 39 | 40 | 1.7 41 | 1.7 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /www.douban.com/src/main/java/com/github/wycm/DoubanCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | import org.openqa.selenium.By; 8 | import org.openqa.selenium.WebDriver; 9 | import org.openqa.selenium.WebElement; 10 | import org.openqa.selenium.chrome.ChromeDriver; 11 | import 
org.openqa.selenium.support.ui.ExpectedCondition; 12 | import org.openqa.selenium.support.ui.WebDriverWait; 13 | 14 | /** 15 | * 豆瓣selenium爬虫 16 | * 运行需要下载chromedirever,并修改代码中的chromedirver地址 17 | */ 18 | public class DoubanCrawler { 19 | private static WebDriver driver; 20 | static { 21 | System.setProperty("webdriver.chrome.driver", "D:/dev/selenium/chromedriver_V2.30/chromedriver_win32/chromedriver.exe"); 22 | driver = new ChromeDriver(); 23 | } 24 | public static void main(String[] args) throws InterruptedException { 25 | douban(); 26 | driver.quit(); 27 | } 28 | private static void douban(){ 29 | driver.get("https://book.douban.com/subject_search?search_text=%E4%BA%92%E8%81%94%E7%BD%91&cat=1001"); 30 | By by = By.cssSelector("div[id='root']"); 31 | waitForLoad(driver, by); 32 | String pageSource = driver.getPageSource(); 33 | Document document = Jsoup.parse(pageSource); 34 | Elements elements = document.select("a[class=title-text]"); 35 | for(Element element: elements){ 36 | System.out.println(element.text()); 37 | } 38 | } 39 | /** 40 | * 等待元素加载,10s超时 41 | * @param driver 42 | * @param by 43 | */ 44 | public static void waitForLoad(final WebDriver driver, final By by){ 45 | new WebDriverWait(driver, 10).until(new ExpectedCondition() { 46 | public Boolean apply(WebDriver d) { 47 | WebElement element = driver.findElement(by); 48 | if (element != null){ 49 | return true; 50 | } 51 | return false; 52 | } 53 | }); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /www.goubanjia.com/src/main/java/com/github/wycm/GoubanjiaCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | import java.io.IOException; 9 | import java.net.URL; 10 | 11 | public class GoubanjiaCrawler { 12 | public static void 
main(String[] args) throws IOException { 13 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000); 14 | setPort(document); 15 | //获取class='table'的table的所有子节点tr 16 | Elements elements = document.select("table[class=table] tr"); 17 | for (int i = 1; i < elements.size(); i++){ 18 | //获取td节点 19 | Element td = elements.get(i).select("td").first(); 20 | /** 21 | * 查找所有style属性包含none字符串的标签(页面上未显示的标签),并移除 22 | * 包括以下两种 23 | * style=display: none; 24 | * style=display:none; 25 | */ 26 | for(Element none : td.select("[style*=none;]")){ 27 | none.remove(); 28 | } 29 | //移除空格 30 | String ipPort = td.text().replaceAll(" ", ""); 31 | //打印 32 | System.out.println(ipPort); 33 | } 34 | } 35 | 36 | /** 37 | * js代码port还原 38 | * @param doc 39 | */ 40 | private static void setPort(Document doc){ 41 | for (Element e : doc.select(".port")){//$('.port').each(function() { 42 | String a = e.text();//var a = $(this).html(); 43 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) { 44 | return; 45 | } 46 | String b = e.attr("class");//var b = $(this).attr('class'); 47 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1]; 48 | String[] c = b.split("");//var c = b.split(""); 49 | int d = b.length();//var d = c.length; 50 | StringBuilder f = new StringBuilder();//var f = []; 51 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) { 52 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g])) 53 | } 54 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3) 55 | } 56 | } 57 | } -------------------------------------------------------------------------------- /www.zhihu.com/src/main/java/com/github/wycm/ZhihuCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.apache.http.client.CookieStore; 4 | import org.apache.http.client.methods.CloseableHttpResponse; 5 | import 
org.apache.http.client.methods.HttpGet; 6 | import org.apache.http.client.protocol.HttpClientContext; 7 | import org.apache.http.impl.client.BasicCookieStore; 8 | import org.apache.http.impl.client.CloseableHttpClient; 9 | import org.apache.http.impl.client.HttpClients; 10 | import org.apache.http.impl.client.LaxRedirectStrategy; 11 | import org.apache.http.impl.cookie.BasicClientCookie; 12 | import org.apache.http.util.EntityUtils; 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | 18 | import java.io.IOException; 19 | import java.net.URL; 20 | import java.util.Calendar; 21 | 22 | public class ZhihuCrawler { 23 | private final static String RAW_COOKIES = "拷贝浏览器知乎cookie至此"; 24 | private static final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"; 25 | private static final CloseableHttpClient httpClient = HttpClients.custom() 26 | .setUserAgent(userAgent) 27 | //设置post默认重定向 28 | .setRedirectStrategy(new LaxRedirectStrategy()) 29 | .build(); 30 | private static final HttpClientContext httpClientContext = HttpClientContext.create(); 31 | private static CookieStore cookieStore = new BasicCookieStore(); 32 | 33 | 34 | static { 35 | for (String rawCookie : RAW_COOKIES.split("; ")){ 36 | String[] s = rawCookie.split("="); 37 | BasicClientCookie cookie = new BasicClientCookie(s[0], s[1]); 38 | cookie.setDomain("zhihu.com"); 39 | cookie.setPath("/"); 40 | cookie.setSecure(false); 41 | cookie.setAttribute("domain", "zhihu.com"); 42 | Calendar calendar = Calendar.getInstance(); 43 | calendar.add(Calendar.DAY_OF_MONTH, +5); 44 | cookie.setExpiryDate(calendar.getTime()); 45 | cookieStore.addCookie(cookie); 46 | } 47 | httpClientContext.setCookieStore(cookieStore); 48 | } 49 | public static void main(String[] args) throws IOException { 50 | HttpGet httpGet = new HttpGet("https://www.zhihu.com"); 51 | 
CloseableHttpResponse response = httpClient.execute(httpGet, httpClientContext); 52 | String s = EntityUtils.toString(response.getEntity()); 53 | Document document = Jsoup.parse(s); 54 | Elements elements = document.select("div[class=ContentItem AnswerItem]"); 55 | for (Element e : elements){ 56 | // System.out.println(e); 57 | System.out.println(e.select("meta[itemprop=url]").first().attr("content")); 58 | System.out.println(e.select("meta[itemprop=name]").first().attr("content")); 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /weibo/src/main/java/com/github/wycm/WeiboCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import com.jayway.jsonpath.DocumentContext; 4 | import com.jayway.jsonpath.JsonPath; 5 | import org.apache.http.client.methods.HttpPost; 6 | 7 | import java.io.IOException; 8 | import java.net.URLEncoder; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | /** 15 | * Created by wangyang on 2017/8/22. 
16 | */ 17 | public class WeiboCrawler { 18 | private static final String CHECK_URL = "https://login.sina.com.cn/sso/prelogin.php?checkpin=1&entry=mweibo&su=MTMwODgyODA4NjA=&callback=jsonpcallback1503386116934"; 19 | private static final String LOGIN_URL = "https://passport.weibo.cn/sso/login"; 20 | private static final String POST_ARGS = "username=13268037201&password=password&savestate=1&r=http%3A%2F%2Fm.weibo.cn%2F&ec=0&pagerefer=https%3A%2F%2Fpassport.weibo.cn%2Fsignin%2Fwelcome%3Fentry%3Dmweibo%26r%3Dhttp%253A%252F%252Fm.weibo.cn%252F&entry=mweibo&wentry=&loginfrom=&client_id=&code=&qq=&mainpageflag=1&hff=&hfp="; 21 | private static final String KEYWORD_ARGS = "type=all&queryVal=${keyword}&luicode=10000011&lfid=106003type%3D1&title=${keyword}&containerid=100103type%3D1%26q%3D${keyword}"; 22 | /** 23 | * 搜索url 24 | */ 25 | public static void main(String[] args) throws IOException { 26 | String searchUrl = "https://m.weibo.cn/api/container/getIndex"; 27 | String keyword = "联想电脑"; 28 | System.out.println(HttpClientUtil.get(CHECK_URL)); 29 | HttpPost post = new HttpPost(LOGIN_URL); 30 | //该header必须要 31 | post.addHeader("Referer", "https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F"); 32 | HttpClientUtil.setHttpPostParams(post, queryToMap(POST_ARGS)); 33 | String res = HttpClientUtil.getWebPage(post); 34 | System.out.println(res); 35 | System.out.println(HttpClientUtil.get("https://m.weibo.cn/")); 36 | 37 | 38 | String searchArgs = KEYWORD_ARGS.replaceAll("\\$\\{keyword\\}", URLEncoder.encode(keyword, "utf-8")); 39 | searchUrl = searchUrl + "?" 
+ searchArgs; 40 | String searchRes = HttpClientUtil.get(searchUrl); 41 | Pattern pattern = Pattern.compile("idstr\":\"(\\d+)\""); 42 | Matcher matcher = pattern.matcher(searchRes); 43 | while (matcher.find()){ 44 | String commentId = matcher.group(1); 45 | System.out.println(commentId); 46 | } 47 | // String result = HttpClientUtil.get("https://m.weibo.cn/api/comments/show?id=4154417035431509&page=1"); 48 | // System.out.println(result); 49 | } 50 | /** 51 | * returns the url parameters in a map 52 | * @param query 53 | * @return map 54 | */ 55 | public static Map queryToMap(String query){ 56 | if (query == null){ 57 | query = ""; 58 | } 59 | Map result = new HashMap(); 60 | for (String param : query.split("&")) { 61 | String pair[] = param.split("="); 62 | if (pair.length>1) { 63 | result.put(pair[0], pair[1]); 64 | }else{ 65 | result.put(pair[0], ""); 66 | } 67 | } 68 | return result; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /music163/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.github.wycm 8 | music163 9 | 1.0-SNAPSHOT 10 | 11 | music163 12 | http://www.example.com 13 | 14 | 15 | UTF-8 16 | 1.8 17 | 1.8 18 | 19 | 20 | 21 | 22 | org.seleniumhq.selenium 23 | selenium-server 24 | 3.0.1 25 | 26 | 27 | org.slf4j 28 | slf4j-api 29 | 1.7.6 30 | 31 | 32 | org.slf4j 33 | slf4j-log4j12 34 | 1.7.6 35 | 36 | 37 | junit 38 | junit 39 | 4.11 40 | test 41 | 42 | 43 | redis.clients 44 | jedis 45 | 2.5.2 46 | 47 | 48 | 49 | 50 | 51 | 52 | src/main/resources 53 | 54 | **/* 55 | 56 | 57 | 58 | src/main/resources 59 | 60 | log4j.properties 61 | 62 | true 63 | 64 | 65 | 66 | 67 | 70 | 71 | org.apache.maven.plugins 72 | maven-assembly-plugin 73 | 2.5.5 74 | 75 | 76 | 77 | com.github.wycm.Music163 78 | 79 | 80 | 81 | jar-with-dependencies 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | dev 91 | 92 | 93 | src/main/resources/log4j-dev.properties 94 | 95 
| 96 | 97 | true 98 | 99 | 100 | 101 | prod 102 | 103 | 104 | src/main/resources/log4j-prod.properties 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /music163/src/main/java/com/github/wycm/Music163.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.openqa.selenium.*; 4 | import org.openqa.selenium.chrome.ChromeDriver; 5 | import org.openqa.selenium.chrome.ChromeOptions; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.*; 10 | import java.util.concurrent.TimeUnit; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | /** 15 | * Created by wycm 16 | */ 17 | public class Music163 { 18 | private static Logger logger = LoggerFactory.getLogger(Music163.class); 19 | 20 | //拷贝登录成功的浏览器原始cookie 21 | private final static String RAW_COOKIES = "cookie1=value1; cookie2=value2"; 22 | private final static String CHROME_DRIVER_PATH = "/Users/wangyang/Downloads/chromedriver"; 23 | //歌曲列表id 24 | private static String startId = "22336453"; 25 | 26 | 27 | private static String userId = null; 28 | private static Set playListSet = new HashSet<>(); 29 | private static Pattern pattern = Pattern.compile("(.*?)(.*?)"); 30 | private static Pattern songName = Pattern.compile("class=\"f-thide name fc1 f-fl\" title=\"(.*?)\""); 31 | private static ChromeOptions chromeOptions = new ChromeOptions(); 32 | private static WebDriver driver = null; 33 | static { 34 | System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH); 35 | chromeOptions.addArguments("--no-sandbox"); 36 | } 37 | public static void main(String[] args) throws InterruptedException { 38 | while (true){ 39 | try { 40 | driver = new ChromeDriver(chromeOptions); 41 | playListSet.add(startId); 42 | invoke(); 43 | } catch (Exception e){ 44 | logger.error(e.getMessage(), e); 45 | } finally { 46 | 
driver.quit(); 47 | } 48 | Thread.sleep(1000 * 10); 49 | } 50 | } 51 | 52 | /** 53 | * 初始化cookies 54 | */ 55 | private static void initCookies(){ 56 | Arrays.stream(RAW_COOKIES.split("; ")).forEach(rawCookie -> { 57 | String[] ss = rawCookie.split("="); 58 | Cookie cookie = new Cookie.Builder(ss[0], ss[1]).domain(".163.com").build(); 59 | driver.manage().addCookie(cookie); 60 | }); 61 | } 62 | private static void invoke() throws InterruptedException { 63 | driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS); 64 | driver.manage().timeouts().pageLoadTimeout(15, TimeUnit.SECONDS); 65 | String s = null; 66 | driver.get("http://music.163.com/"); 67 | initCookies(); 68 | driver.get("http://music.163.com/"); 69 | s = driver.getPageSource(); 70 | userId = group(s, "userId:(\\d+)", 1); 71 | driver.get("https://music.163.com/#/playlist?id=" + startId); 72 | driver.switchTo().frame("contentFrame"); 73 | WebElement element = driver.findElement(By.cssSelector("[id=content-operation]>a:first-child")); 74 | element.click(); 75 | ((JavascriptExecutor) driver).executeScript("window.open('about:blank')"); 76 | ArrayList tabs = new ArrayList(driver.getWindowHandles()); 77 | driver.switchTo().window(tabs.get(0)); 78 | driver.switchTo().defaultContent(); 79 | int i = 0; 80 | String lastSongName = ""; 81 | int count = 0; 82 | while (true){ 83 | if(i > Integer.MAX_VALUE - 2){ 84 | break; 85 | } 86 | i++; 87 | s = driver.getPageSource(); 88 | driver.switchTo().window(tabs.get(1)); //switches to new tab 89 | String songs = null; 90 | try{ 91 | driver.get("https://music.163.com/user/home?id=" + userId); 92 | driver.switchTo().frame("contentFrame"); 93 | songs = group(driver.getPageSource(), "累积听歌(\\d+)首", 1); 94 | } catch (TimeoutException e){ 95 | logger.error(e.getMessage(), e); 96 | } 97 | driver.switchTo().window(tabs.get(0)); 98 | Matcher matcher = pattern.matcher(s); 99 | Matcher songNameMatcher = songName.matcher(s); 100 | if (matcher.find() && songNameMatcher.find()){ 101 
| String songNameStr = songNameMatcher.group(1); 102 | if (!songNameStr.equals(lastSongName)){ 103 | count++; 104 | lastSongName = songNameStr; 105 | } 106 | logger.info(songNameStr + "-" + matcher.group(1) + matcher.group(2) + "---当前播放第" + count + "首歌曲, 累计听歌:" + songs); 107 | } else { 108 | logger.info("解析歌曲播放记录或歌曲名失败"); 109 | } 110 | Thread.sleep(1000 * 30); 111 | } 112 | } 113 | public static String group(String str, String regex, int index) { 114 | Pattern pattern = Pattern.compile(regex); 115 | Matcher matcher = pattern.matcher(str); 116 | return matcher.find() ? matcher.group(index) : ""; 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /www.goubanjia.com/README.md: -------------------------------------------------------------------------------- 1 | ## 摘要 2 | * 上一篇以知乎网为例简单分享网络请求分析。这一篇主要分享一种应对反爬虫的方法,前端数据混淆。 3 | ## 目的 4 | * 之前写https://github.com/wycm/zhihu-crawler项目的时候,需要用到免费的http代理,然后找到了这个 http://www.goubanjia.com/ 这个网站。现在需要把这个网站上的ip和port爬取下来,有兴趣的朋友也可以尝试自己爬取一下。 5 | ## 开始 6 | * 打开这个网站首页,然后控制台查看ip和port的对应标签。 7 | ![](http://upload-images.jianshu.io/upload_images/5830895-166662036a68a8ac.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 8 | * 如上图(图一),从控制台的标签中可以看出ip加了一些无关不显示的标签来混淆数据,这里混淆的原理其实很简单,通过标签的style="display:none"属性来达到混淆的目的,也就是包含这个属性的标签是不会显示在页面上的。知道了这一点就比较好处理了,只需要在解析的时候把包含style="display:none"属性的标签去掉。就可以轻松的拿到ip和port数据了。 9 | * 代码如下 10 | ``` 11 | package com.cnblogs.wycm; 12 | 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | import java.io.IOException; 18 | import java.net.URL; 19 | 20 | /** 21 | * 22 | * 数据的解析采用的是Jsoup框架,Jsoup是一个操作HTML标签的Java库,它提供了非常方便的API来提取和操纵库,支持类似jquery的选择器来查找标签。 23 | * 由于请求比较单一,这里的网络请求并没有采用上一篇所使用HttpClient框架。直接通过Jsoup来执行http请求的。 24 | * 关于Jsoup的使用可以参考http://www.open-open.com/jsoup/ 25 | * 26 | */ 27 | public class Chapter1 { 28 | public static void main(String[] args) throws 
IOException { 29 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000); 30 | //获取class='table'的table的所有子节点tr 31 | Elements elements = document.select("table[class=table] tr"); 32 | for (int i = 1; i < elements.size(); i++){ 33 | //获取td节点 34 | Element td = elements.get(i).select("td").first(); 35 | /** 36 | * 查找所有style属性包含none字符串的标签(页面上未显示的标签),并移除 37 | * 包括以下两种 38 | * style=display: none; 39 | * style=display:none; 40 | */ 41 | for(Element none : td.select("[style*=none;]")){ 42 | none.remove(); 43 | } 44 | //移除空格 45 | String ipPort = td.text().replaceAll(" ", ""); 46 | //打印 47 | System.out.println(ipPort); 48 | } 49 | } 50 | } 51 | /* 52 | 第一次运行打印结果: 53 | 183.129.246.228:8132 54 | 222.92.136.206:8987 55 | 54.238.186.100:8988 56 | ... 57 | 第二次运行打印结果: 58 | 183.129.246.228:8377 59 | 222.92.136.206:9059 60 | 54.238.186.100:8622 61 | ... 62 | */ 63 | ``` 64 | * ip地址能够准确的拿到了,却发现port被做了混淆,而且每次返回的port还在动态改变。大家可以通过把浏览器的JavaScrip脚本关闭后,然后刷新这个网页。会发现每次的port都不一样。我们每次看到的正确port都是通过JavaScript脚本处理后的。如果采用普通爬虫的方式拿到的port都是错误的。现在要想拿到正确的port,可以通过分析它JavaScrip脚本还原数据的逻辑。 65 | * 同样打开控制台->选择Sources->选择一行js代码打断点(点击行编号),如下图 66 | ![](http://upload-images.jianshu.io/upload_images/5830895-aa150ab7dee00d09.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 67 | * 刷新网页—>页面Paused in debugger—>选择Elements->右键td节点->Break on...->subtree modifications。这两个步骤就是在设置断点调试,也就是在td节点发生改变的时候paused。 68 | ![](http://upload-images.jianshu.io/upload_images/5830895-17771b34ebc43f3c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 69 | * 选择Sources->F8(继续执行),这个时候又会有一次pause,也就是js脚本在还原正确port的时候(如下图) 70 | ![](http://upload-images.jianshu.io/upload_images/5830895-ad17fd4ba7733441.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 71 | * 函数的调用栈有好多层,如何快速定位哪一个函数的技巧就是,看它局部变量表的变量变化,因为这里是port在发生改变,然后找到对应变量和对应逻辑函数。简单分析可以确定到port发生改变的函数是一个匿名函数,如下图 72 | ![](http://upload-images.jianshu.io/upload_images/5830895-55a376f91fee8519.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 73 | * 
格式化后,代码如下: 74 | ``` 75 | var _$ = ['\x2e\x70\x6f\x72\x74', "\x65\x61\x63\x68", "\x68\x74\x6d\x6c", "\x69\x6e\x64\x65\x78\x4f\x66", '\x2a', "\x61\x74\x74\x72", '\x63\x6c\x61\x73\x73', "\x73\x70\x6c\x69\x74", "\x20", "", "\x6c\x65\x6e\x67\x74\x68", "\x70\x75\x73\x68", '\x41\x42\x43\x44\x45\x46\x47\x48\x49\x5a', "\x70\x61\x72\x73\x65\x49\x6e\x74", "\x6a\x6f\x69\x6e", '']; 76 | $(function() { 77 | $(_$[0])[_$[1]](function() { 78 | var a = $(this)[_$[2]](); 79 | if (a[_$[3]](_$[4]) != -0x1) { 80 | return 81 | } 82 | ;var b = $(this)[_$[5]](_$[6]); 83 | try { 84 | b = (b[_$[7]](_$[8]))[0x1]; 85 | var c = b[_$[7]](_$[9]); 86 | var d = c[_$[10]]; 87 | var f = []; 88 | for (var g = 0x0; g < d; g++) { 89 | f[_$[11]](_$[12][_$[3]](c[g])) 90 | } 91 | ;$(this)[_$[2]](window[_$[13]](f[_$[14]](_$[15])) >> 0x3) 92 | } catch (e) {} 93 | }) 94 | }) 95 | ``` 96 | * 还原后如下: 97 | ``` 98 | var _$ = ['.port', "each", "html", "indexOf", '*', "attr", 'class', "split", " ", "", "length", "push", 'ABCDEFGHIZ', "parseInt", "join", '']; 99 | $(function() { 100 | $('.port').each(function() { 101 | var a = $(this).html(); 102 | if (a.indexOf('*') != -0x1) { 103 | return 104 | } 105 | ;var b = $(this).attr('class'); 106 | try { 107 | b = (b.split(" "))[0x1]; 108 | var c = b.split(""); 109 | var d = c.length; 110 | var f = []; 111 | for (var g = 0x0; g < d; g++) { 112 | f.push('ABCDEFGHIZ'.indexOf(c[g])) 113 | } 114 | ;$(this).html(window.parseInt(f.join('')) >> 0x3) 115 | } catch (e) {} 116 | }) 117 | }) 118 | ``` 119 | * 这段代码的逻辑,获取port标签的class属性值,取出属性中后面的几个大写字母,遍历该字符串,找出每次字符在'ABCDEFGHIZ'这个字符串中的索引,然后parseInt转换为整数,然后进行右移3位的操作。 120 | * 完整代码实现 121 | ``` 122 | package com.cnblogs.wycm; 123 | 124 | import org.jsoup.Jsoup; 125 | import org.jsoup.nodes.Document; 126 | import org.jsoup.nodes.Element; 127 | import org.jsoup.select.Elements; 128 | 129 | import java.io.IOException; 130 | import java.net.URL; 131 | 132 | public class Chapter2 { 133 | public static void main(String[] args) throws IOException { 
134 | Document document= Jsoup.parse(new URL("http://www.goubanjia.com/"), 10000); 135 | setPort(document); 136 | //获取class='table'的table的所有子节点tr 137 | Elements elements = document.select("table[class=table] tr"); 138 | for (int i = 1; i < elements.size(); i++){ 139 | //获取td节点 140 | Element td = elements.get(i).select("td").first(); 141 | /** 142 | * 查找所有style属性包含none字符串的标签(页面上未显示的标签),并移除 143 | * 包括以下两种 144 | * style=display: none; 145 | * style=display:none; 146 | */ 147 | for(Element none : td.select("[style*=none;]")){ 148 | none.remove(); 149 | } 150 | //移除空格 151 | String ipPort = td.text().replaceAll(" ", ""); 152 | //打印 153 | System.out.println(ipPort); 154 | } 155 | } 156 | 157 | /** 158 | * js代码port还原 159 | * @param doc 160 | */ 161 | private static void setPort(Document doc){ 162 | for (Element e : doc.select(".port")){//$('.port').each(function() { 163 | String a = e.text();//var a = $(this).html(); 164 | if(a.indexOf("*") != -0x1){//if (a.indexOf('*') != -0x1) { 165 | return; 166 | } 167 | String b = e.attr("class");//var b = $(this).attr('class'); 168 | b = b.split(" ")[0x1];//b = (b.split(" "))[0x1]; 169 | String[] c = b.split("");//var c = b.split(""); 170 | int d = b.length();//var d = c.length; 171 | StringBuilder f = new StringBuilder();//var f = []; 172 | for(int g = 0x0; g < d; g++){//for (var g = 0x0; g < d; g++) { 173 | f.append("ABCDEFGHIZ".indexOf(c[g]));//f.push('ABCDEFGHIZ'.indexOf(c[g])) 174 | } 175 | e.text(String.valueOf(Integer.valueOf(f.toString()) >> 0x3));//$(this).html(window.parseInt(f.join('')) >> 0x3) 176 | } 177 | } 178 | } 179 | ``` 180 | * maven依赖 181 | ``` 182 | 183 | org.jsoup 184 | jsoup 185 | 1.10.2 186 | 187 | ``` 188 | ## 总结 189 | * 该篇文章简单分项了下如何应对前端混淆的反爬虫。关于这种反爬虫,还有其它的一些应对方式。如采用无头浏览器的方式,比如phantomjs框架。这种无头浏览器原本是用来做自动化测试的。它是基于webkit内核的,所以它可以较容易的爬取这种前端混淆的这种网站。一般来说浏览器能够正常访问到的数据,这种方式也可以比较容易爬取这些数据。当然这种方式的最大问题就是效率比较低。因为这种方式它每加载一个页面,都需要下载它的附加资源,如js脚本,脚本下载完成后,还要去执行js脚本。 190 | * 
我这里采用的方式是阅读js代码,分析出前端混淆的逻辑,然后再通过目标语言来实现对应的逻辑。这种方式针对一些简单的加密混淆还是很有用的。但是当遇到一些大型复杂的网站,如百度、微博等,需要抓取登录后的数据时,就需要手动模拟登录。相对来说,这种网站的模拟登录会更复杂,寻找各种登录参数的来源都会耗费大量精力,分析请求的成本比较高。这种方式的优点是爬取速度快,只获取目标数据,不需要额外的网络请求成本。 191 | 192 | >![](http://upload-images.jianshu.io/upload_images/5830895-6a8b96dde229c26c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 193 |
一个程序员日常分享,包括但不限于爬虫、Java后端技术,欢迎关注。 -------------------------------------------------------------------------------- /weibo/src/main/java/com/github/wycm/HttpClientUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.apache.http.*; 4 | import org.apache.http.client.CookieStore; 5 | import org.apache.http.client.HttpRequestRetryHandler; 6 | import org.apache.http.client.config.CookieSpecs; 7 | import org.apache.http.client.config.RequestConfig; 8 | import org.apache.http.client.entity.UrlEncodedFormEntity; 9 | import org.apache.http.client.methods.CloseableHttpResponse; 10 | import org.apache.http.client.methods.HttpGet; 11 | import org.apache.http.client.methods.HttpPost; 12 | import org.apache.http.client.methods.HttpRequestBase; 13 | import org.apache.http.client.protocol.HttpClientContext; 14 | import org.apache.http.config.ConnectionConfig; 15 | import org.apache.http.config.Registry; 16 | import org.apache.http.config.RegistryBuilder; 17 | import org.apache.http.config.SocketConfig; 18 | import org.apache.http.conn.ConnectTimeoutException; 19 | import org.apache.http.conn.socket.ConnectionSocketFactory; 20 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 21 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 22 | import org.apache.http.conn.ssl.TrustStrategy; 23 | import org.apache.http.cookie.Cookie; 24 | import org.apache.http.impl.client.*; 25 | import org.apache.http.impl.conn.DefaultProxyRoutePlanner; 26 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 27 | import org.apache.http.message.BasicNameValuePair; 28 | import org.apache.http.protocol.HttpContext; 29 | import org.apache.http.ssl.SSLContexts; 30 | import org.apache.log4j.Logger; 31 | 32 | import javax.net.ssl.SSLContext; 33 | import javax.net.ssl.SSLException; 34 | import java.io.*; 35 | import java.net.UnknownHostException; 36 | import 
java.nio.charset.CodingErrorAction; 37 | import java.security.KeyStore; 38 | import java.security.cert.CertificateException; 39 | import java.security.cert.X509Certificate; 40 | import java.util.ArrayList; 41 | import java.util.List; 42 | import java.util.Map; 43 | 44 | /** 45 | * HttpClient工具类 46 | */ 47 | public class HttpClientUtil { 48 | private static Logger logger = Logger.getLogger(HttpClientUtil.class); 49 | private static CloseableHttpClient httpClient; 50 | private final static HttpClientContext httpClientContext = HttpClientContext.create(); 51 | private final static String userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727)"; 52 | private static HttpHost proxy; 53 | private static RequestConfig requestConfig; 54 | static { 55 | init(); 56 | } 57 | private static void init() { 58 | try { 59 | SSLContext sslContext = 60 | SSLContexts.custom() 61 | .loadTrustMaterial(KeyStore.getInstance(KeyStore.getDefaultType()), new TrustStrategy() { 62 | @Override 63 | public boolean isTrusted(X509Certificate[] chain, String authType) 64 | throws CertificateException { 65 | return true; 66 | } 67 | }).build(); 68 | SSLConnectionSocketFactory sslSFactory = 69 | new SSLConnectionSocketFactory(sslContext); 70 | Registry socketFactoryRegistry = 71 | RegistryBuilder.create() 72 | .register("http", PlainConnectionSocketFactory.INSTANCE).register("https", sslSFactory) 73 | .build(); 74 | 75 | PoolingHttpClientConnectionManager connManager = 76 | new PoolingHttpClientConnectionManager(socketFactoryRegistry); 77 | 78 | SocketConfig socketConfig = SocketConfig.custom().setTcpNoDelay(true).build(); 79 | connManager.setDefaultSocketConfig(socketConfig); 80 | 81 | ConnectionConfig connectionConfig = 82 | ConnectionConfig.custom().setMalformedInputAction(CodingErrorAction.IGNORE) 83 | .setUnmappableInputAction(CodingErrorAction.IGNORE).setCharset(Consts.UTF_8).build(); 84 | 
connManager.setDefaultConnectionConfig(connectionConfig); 85 | connManager.setMaxTotal(300); 86 | connManager.setDefaultMaxPerRoute(100); 87 | 88 | HttpRequestRetryHandler retryHandler = new HttpRequestRetryHandler() { 89 | @Override 90 | public boolean retryRequest(IOException exception, int executionCount, HttpContext context) { 91 | if (executionCount > 0) { 92 | return false; 93 | } 94 | if (exception instanceof InterruptedIOException) { 95 | return true; 96 | } 97 | if (exception instanceof ConnectTimeoutException) { 98 | return true; 99 | } 100 | if (exception instanceof UnknownHostException) { 101 | return true; 102 | } 103 | if (exception instanceof SSLException) { 104 | return true; 105 | } 106 | HttpRequest request = HttpClientContext.adapt(context).getRequest(); 107 | if (!(request instanceof HttpEntityEnclosingRequest)) { 108 | return true; 109 | } 110 | return false; 111 | } 112 | }; 113 | 114 | HttpClientBuilder httpClientBuilder = 115 | HttpClients.custom().setConnectionManager(connManager).setRetryHandler(retryHandler) 116 | //设置post默认重定向 117 | .setRedirectStrategy(new LaxRedirectStrategy()) 118 | .setDefaultCookieStore(new BasicCookieStore()).setUserAgent(userAgent); 119 | if (proxy != null) { 120 | httpClientBuilder.setRoutePlanner(new DefaultProxyRoutePlanner(proxy)).build(); 121 | } 122 | httpClient = httpClientBuilder.build(); 123 | 124 | requestConfig = RequestConfig.custom().setSocketTimeout(10000). 125 | setConnectTimeout(10000). 126 | setConnectionRequestTimeout(10000). 127 | setCookieSpec(CookieSpecs.STANDARD). 
128 | build(); 129 | } catch (Exception e) { 130 | logger.error(e.getMessage()); 131 | } 132 | } 133 | public static String get(String url) throws IOException { 134 | HttpGet request = new HttpGet(url); 135 | return getWebPage(request, null, "UTF-8", false); 136 | } 137 | public static String get(HttpRequestBase request, RequestConfig config) throws IOException { 138 | return getWebPage(request, config, "UTF-8", false); 139 | } 140 | public static String getWebPage(HttpRequestBase request) throws IOException { 141 | return getWebPage(request, null, "UTF-8", false); 142 | } 143 | public static String getWebPage(HttpRequestBase request, RequestConfig config) throws IOException { 144 | return getWebPage(request, config, "UTF-8", false); 145 | } 146 | /** 147 | * 148 | * @param request 请求 149 | * @param encoding 字符编码 150 | * @param isPrintConsole 是否打印到控制台 151 | * @return 网页内容 152 | */ 153 | public static String getWebPage(HttpRequestBase request, 154 | RequestConfig config, 155 | String encoding, 156 | boolean isPrintConsole) throws IOException { 157 | CloseableHttpResponse response = null; 158 | if (config != null){ 159 | request.setConfig(config); 160 | } 161 | else { 162 | request.setConfig(requestConfig); 163 | } 164 | response = httpClient.execute(request, httpClientContext); 165 | logger.info("status---" + response.getStatusLine().getStatusCode()); 166 | BufferedReader rd = null; 167 | StringBuilder webPage = null; 168 | try { 169 | rd = new BufferedReader( 170 | new InputStreamReader(response.getEntity().getContent(),encoding)); 171 | String line = ""; 172 | webPage = new StringBuilder(); 173 | while((line = rd.readLine()) != null) { 174 | webPage.append(line); 175 | if(isPrintConsole){ 176 | System.out.println(line); 177 | } 178 | } 179 | } catch (IOException e) { 180 | e.printStackTrace(); 181 | } 182 | request.releaseConnection(); 183 | response.close(); 184 | return webPage.toString(); 185 | } 186 | /** 187 | * 设置request请求参数 188 | * @param request 189 | * 
@param params 190 | */ 191 | public static void setHttpPostParams(HttpPost request,Map params) throws UnsupportedEncodingException { 192 | List formParams = new ArrayList(); 193 | for (String key : params.keySet()) { 194 | formParams.add(new BasicNameValuePair(key,params.get(key))); 195 | } 196 | UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formParams, "utf-8"); 197 | request.setEntity(entity); 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /www.dianping.com/src/test/resources/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 洗冈五辅结葱练嫁犯政外素永闯词聚赤粥咱帆反烈舞悬航西融芦跳唐截户泻阶陕厕宁拐久闭毁 9 | 议勾昆构败捐介厅叙樱奥膜单炼响古各饶愧泼凳铅孙究省灭艰穿坐肾蚂编烟珍膝蒸看劝 10 | 较雕批虚狮沟乳嘴滚港精嗽垒盯建我贯肚准斑庸勉闷贝侄挣鸽略佛证领灰 11 | 遮欣沃仅器挪崭冰胡事站乓曾恐区里愤苏司倒疤丑合谜呼浑纳广劳脆 12 | 它枕位独鹿糖畅标宅静占沫劫宜驳辽钞清酬策抢犁王村粗玉喜篮蝶仿金衫腿馆搁影埋胶维龄靠腊 13 | 摧蚀嫌云官致易雨傲鸟吊样杯窃迎伍棵坚爱目唱向曲帖否劈求邮潜灶胆奖柔毕鸭仰胸州山晋插令 14 | 卵近刷绿几屈止件牌捏兼愁趴纱篇烫正霜声倘眨谷株 15 | 扇攻醋茎哑口庙整熔杂艺拿吧辣中减都似愿虎仁槐驻程寿杜忆岸交键污华科帜宿故潮乱 16 | 榴摆冬架稠线课旧稿盘暖漂薪协坛午最信驰轧坏溪缝瓦奋姑扎当僻殃康昏直夕炕害叔歉络着遍 17 | 跌值袭抛贵水脸尊救恢辩匠柜雪伯充悼葡裳笋蹄钉核庆竞 18 | 边御柄韵昼屿镇再码达幸槽配百酷瞒叉宣揭溉尝 19 | 弱屋播数哲亏闲晌提规导怪顺描著往花秤啄望贪油惭买椒绵舟住 20 | 遗茂回察贷引刀晃梅咏秩济色级放礼始穗质佳筒竭弃佣草天夜浸暂蜘盐降圆板晓讨赞猪召益 21 | 室鹅输纤布药第痕尸钩渐估很厘枣野妈池阻费姜陆讯土朽叠方浓勇智入氧桥酸塘改腥盟被 22 | 姥葛模轨戚励揪鸣芝赶身秧旱群爆族吐怀南电婚搭兆呢越爪重 23 | 丧伟则技快恩浊涉步峰圾思惨孝举赏逢裁驾滴涝旗羞暗煮石蔬 24 | 街活掘莲蔽剥懒拢舍优律菠社井演兄趣产德文萌乙替昨瘦剪匹匪挨振舌圈喉但 25 | 银秃畜柏宽籍僚贿衬辞砍补圣市浅粒妨丈习橘子那侧力奔软继廉盼箭仓 26 | 死惜熄甘骤搏爽吓算效红逃凶芒摇语什罗秒要钳之血干欺哄捡榆瓶笨唤滋皱羽映煎风 27 | 物麦典洞隙像坟含箱而芬面森拦沸衡台括与她匙监硬誓心镰掌伞 28 | 筹岭况剑搂护殿佩块翠工一索渠倡此笼侨想热阳骄并腐全间妖锹属植捧探童蜂 29 | 贤皇刮趟进贞绢松沾飘奉见呈赛射找蔑姿铲触橡贡扶查绪掩赢震传闪廊粮者抵咬趋哨种翅裤棍堪 30 | 燥折舅痒环性另蹦纲甲原巧助猜堤泪酿券蚁免鬼择辉蝴高谢摔杏两说喝击约嫂守宇案俘渣喇酱 31 | 磁吩唇喷行意倦出囊虫默无嚼副乞缩险新杀纸葵跑肠孟俭诵毅屡章违挂奸听拘傻母乐根半其功 32 | 左够惠问栏时谦翁待诱离积钥项授量驱设晚脚罪灾榜犹馋谎九吗斤津驶炒锐怜霸咐 33 | 实乌迈妄劲赵茶落定颈审宝叶禁虹淡巷渡某督览叫喊伐狸筝支蛇苗迁 34 | 魂话尘后眯老绩发烤抄魔晶蚊拉火校薄盗斯肉临垮睡至陷紧威期争怨取塔版爹订狼奴辜记 35 | 扩统售堂延忘本材务洒繁弊刊衰短豆嗓店福生众将阁图的等币番卖伸 36 | 脑秆栗才旁葬份县疯冶个躬舱冒钓敏凤系巾薯芳仪泽裹床牲句狡踢陈汁液狗渔撇依洲娱拣 37 | 桐铃己苹搬呆泥甜羡碎林贺织岗毯蜓涌碰燃家铜矩献解早下没斧驴秘验态 38 | 和墓馒睁塑疑计雁哥隐盲暮闸砖凑淋斜控徒末及慢货 39 | 辨阀盾嘉逼懂么惰长丽递扑屠捉袍休臣登余暑殊届茄臭善升农八棋征敢躲 40 | 增顾友汗公歇马旋铸波绍泰悄军骑赔娘饱彼爬绳法 41 | 
踪就隶徐候味简士抽董空拔藏突散茧跨涛腾洪企肢颂璃刑恳米 42 | 需抱狱甩盈貌纽绸俩字谊膊亩借甚垦愈脱枪节造辆锯烦鞠缓职压女骆毒罐悟如餐轻 43 | 限枯残题修楼乎脉卷叮呀卡箩世役便雅健柱漏脏蓄打鹊鞋摸太拨壳歌存暴宏枝兽 44 | 稻恨写朋偶坡吉聋治封们艳气互洁绕巨商民蛋忌手局耽添执可纪丛逆罩粱施款 45 | 丙备负狭柿滨郑移汇氏齿寸横踩堆除粪所捞扬尽喂煤你阔惧崖围分型扒例 46 | 填叛三档昌夺汤珠兵孤派今狂抓变浆塌录画共笑毙深序桨冲采牢托签主膀让绣欲席装忽谣慨惩 47 | 先砌腹容棚悠孕奶二址捆邻关躺岔少帽狠厦楚拌吞梦战炭吸四走贼化周术窗腰渴十参奇莫穴爷防 48 | 克诚却享椅刻端锣脊库辟情迅戴蛮缸桃匀隔抖灌鹰朱丁缎队感讲境细晨裕详 49 | 粘财幕浴列忍申杠弄亡又昂烂式悉栽考势倍妹披萍 50 | 乡愉牧混赚蜻萝完组套挠障诞船李皆闻场送洽慎俗点锋任咳猎养赴怖 51 | 胀盖框霉树轮会源层欧男稼钻抬恼渗蜜冤隆肌通肝沙陪连 52 | 矛错桑歪部虑度胜泉加川饼拴茅咸掀煌颤滩寇英掠距壮际过饿启蛙汉眉稍史湖疲判滔号 53 | 院龙失君白郊嚷厚苍幼营皂窜亚聪跪恭盆黑速逐域应尼倚转窑许也骡缺 54 | 接春毛雀置破晒若催注轿安租睬喘耳荷纷岛钟亿途哗朗伶游鼓疼卜袖 55 | 撕料净服理名给叼粉博辛坊经返到岁足冻钢锤炸鉴或蜡有灯削竟偷疗义僵杆拖元鲁拜柳巡丢蕉 56 | 哀比预胳捎吨症希东梳假玻罢受荒凡姓千虏鸡责创熊刘筑宾视焰俯琴 57 | 巩缘呜以墨牵洋网灿急观惹炊起搞瓜淘慌该斗壤饮 58 | 认骂谨唯烘窄调径侦侍府陵断鄙宪冷荐形诊遣峡孔邀菜漫额刺愚诸汽界抹删诉是 59 | 头幅摩柴座岩遥获剖复偏疆警觉示角须翻避仇臂讽厌屯小乃蚕购京棕桂浇吃办镜抚 60 | 培戒检坦答厨猛河涨掉咽损推剃帝掏牙踏勤译业日亮茫扛在饥玩好武衣制每乔招析疫孩显瞧 61 | 敲锄扭读夹锅别卧大娇携象乒季抗誉纠动纺惯棒浮怎竹亦睛同强哭膛糟拆垫忙难 62 | 儿迫自灵丘已娃路患研人邪惊月乏这耐饰锈随逮阴归 63 | 医尿扁舒异浪欢辰辱鲜包迹眠兔累阿诗袄丹雾泡割慈缴权乘请矮飞客格摄票傍梨伪陶告 64 | 沉蠢荡寻酒续景舰颜弓钱挥碌报牺肩兰漆病熟悔北 65 | 去荣拾球委训捷育展循照艘访他豪竖捕俱帐械付摊旷团驼因稀惑 66 | 螺册盏绒染筐汪敞伤剩阅蒜用青贱鼻把扮允上闹般滤满袋 67 | 脂教食握挽凉凯腔秀明晕泛朴绝虾啦撤耀段搜坑了燕泊拥年选密称堵 68 | 决初何旨烧首寒旺脾涂指识误微厉命慰园作脖予刃按蝇勺相特铁碧未专恋芹馅夫右挖冠鱼良员淹 69 | 吹胁跃鸦丸张沿幻亲门帅碍紫膨滑吴菊适碑妇岂符锦颗漠终盒疮摘国裂缠糕 70 | 宫具海垄堡绑平弹祝猫管宋膏做鞭股遭爸糠资禽代奏毫状谅极联笔痛只纵唉撞严匆蛛羊持撒 71 | 得苦志必书伏旦旅揉秋芽息于切现七姻饭光停谁悲怕眼笛历承还澡敬倾糊哈类条激 72 | 彩妻为龟祖瞎链些立朵盛骨啊郎叹页拳穷萄黄炎载挑 73 | 妥畏猾躁知江麻桶债多蹲率音剧万魄絮星叨央彻真迷袜壁践衔肆非串厂流销六 74 | 利嫩伙供收远纯确带耻巴总投道犬溜疏翼居雷城斩霞宙田亭对耍齐迟卸背逗浩贫内仆坝 75 | 瑞刚锁仍划机消危评宰崇表侵侮困赌欠寨劣筛吵夸祸尤庭梯锻范胞宗来拍斥尾绞释肥操沈帘挡底 76 | 歼嘱肤弦偿念锡顶差党贸伴梁努零更废恒慕艇屑挺猴瓣骗超桌吼磨悦雹挤房仔撑弟押 77 | 富保裙不婶碗处肃肿兴次炉贴试谈税础仙基蒙慧稳丝哪湿促据 78 | 勿庄俊饲谋宵竿援均戏攀矿牛价固私担言炮测师顽弯论虽低由追栋跟黎束排凭 79 | 蹈滥普辫剂拒蛾醉阵顿扯铺扫且胖退烛醒述姨晴罚勒温湾惕顷换蓬尖扰狐集即陡菌朝露果 80 | 痰帮品概忠蓝前姐禾疾使壶成凝常从逝焦杨梢纹怒寄遵夏趁留革丰透擦寺地婆尚班然尺 81 | 泄赠窝能肯卫片饺颠贩筋车双辈覆父旬轰既皮遇开棉搅香恰询润附耗神傅谱挎雄 82 | 榨学乖怠您拼祥体垃忧敌运鼠针肺耕胃殖仗宴木美墙扔泳绘恶赖扣杰塞储垂妙 83 | -------------------------------------------------------------------------------- /www.dianping.com/src/main/java/com/github/wycm/DianpingCrawler.java: -------------------------------------------------------------------------------- 1 | package com.github.wycm; 2 | 3 | import org.apache.http.client.methods.HttpGet; 4 | import 
org.apache.http.impl.client.CloseableHttpClient; 5 | import org.apache.http.impl.client.HttpClients; 6 | import org.apache.http.util.EntityUtils; 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | 11 | import java.io.IOException; 12 | import java.util.*; 13 | import java.util.regex.Matcher; 14 | import java.util.regex.Pattern; 15 | import java.util.stream.Collectors; 16 | 17 | /** 18 | * 美团点评字体反爬 19 | */ 20 | public class DianpingCrawler { 21 | public static void main(String[] args) throws IOException { 22 | getContent("http://www.dianping.com/shop/96231053"); 23 | } 24 | private static void getContent(String detailUrl) throws IOException { 25 | CloseableHttpClient httpClient = HttpClients 26 | .custom() 27 | .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36") 28 | .build(); 29 | String originalContent = EntityUtils.toString(httpClient.execute(new HttpGet(detailUrl)).getEntity()); 30 | Document document= Jsoup.parse(originalContent); 31 | String cssUrl = "http:" + document.select("link[href*=svgtextcss]").first().attr("href"); 32 | String cssResponse = Jsoup.connect(cssUrl).execute().body(); 33 | // System.out.println(cssResponse); 34 | Pattern pattern = Pattern.compile("class\\^=\"(.*?)\".*?url\\((.*?)\\)"); 35 | Matcher matcher = pattern.matcher(cssResponse); 36 | Map urlMap = new HashMap<>(); 37 | Map svgMap = new HashMap<>(); 38 | while (matcher.find()){ 39 | String prefix = matcher.group(1); 40 | String url = "http:" + matcher.group(2); 41 | urlMap.put(prefix, url); 42 | svgMap.put(prefix, EntityUtils.toString(httpClient.execute(new HttpGet(url)).getEntity(), "utf-8")); 43 | System.out.println(prefix); 44 | System.out.println(url); 45 | } 46 | pattern = Pattern.compile("\\.[a-z]{2}-.*?\\{.*?\\}"); 47 | matcher = pattern.matcher(cssResponse); 48 | List cssList = new ArrayList<>(); 49 | Pattern cssBackgroundPattern = 
Pattern.compile("(\\.([a-z]{2})-.*?)\\{background:(.*?)\\.0px (.*?)\\.0px"); 50 | Matcher cssBackgroundMatch; 51 | while (matcher.find()){ 52 | cssBackgroundMatch = cssBackgroundPattern.matcher(matcher.group(0)); 53 | if (cssBackgroundMatch.find()){ 54 | cssList.add(new CssBackground(cssBackgroundMatch.group(1), Integer.valueOf(cssBackgroundMatch.group(3)), Integer.valueOf(cssBackgroundMatch.group(4)))); 55 | } 56 | } 57 | //对css分组排序 58 | cssList.sort((c1, c2) ->{ 59 | int i = c1.getClassName().substring(0, 3).compareTo(c2.getClassName().substring(0, 3)); 60 | if (i != 0){ 61 | return i; 62 | } else { 63 | i = c2.getY().compareTo(c1.getY());; 64 | if (i != 0){ 65 | return i; 66 | } else { 67 | return c2.getX().compareTo(c1.getX()); 68 | } 69 | } 70 | }); 71 | cssList.forEach(System.out::println); 72 | int xIndex = 0; 73 | int yIndex = 0; 74 | CssBackground lastCssBackground = null; 75 | //计算对应字体的坐标 76 | for(CssBackground c : cssList){ 77 | if (lastCssBackground == null){ 78 | lastCssBackground = c; 79 | continue; 80 | } else { 81 | if (!c.getClassName().substring(0, 3).equals(lastCssBackground.getClassName().substring(0, 3))){ 82 | xIndex = 0; 83 | yIndex = 0; 84 | } else if (!c.getX().equals(lastCssBackground.getX()) && c.getY().equals(lastCssBackground.getY())){ 85 | c.setxIndex(++xIndex); 86 | c.setyIndex(yIndex); 87 | } else if (c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){ 88 | c.setxIndex(xIndex); 89 | c.setyIndex(++yIndex); 90 | } else if (!c.getX().equals(lastCssBackground.getX()) && !c.getY().equals(lastCssBackground.getY())){ 91 | xIndex = 0; 92 | c.setxIndex(xIndex); 93 | c.setyIndex(++yIndex); 94 | } 95 | lastCssBackground = c; 96 | } 97 | } 98 | Map cacheDocumentMap = new HashMap<>(); 99 | Map cssBackgroundMap = new HashMap<>(); 100 | String lastPrefix = ""; 101 | cssList.stream().map(c -> { 102 | c.setSvgResponse(svgMap.get(c.getClassName().substring(1, 4))); 103 | if 
(!cacheDocumentMap.containsKey(c.getClassName().substring(0, 3))){ 104 | cacheDocumentMap.put(c.getClassName().substring(0, 3), Jsoup.parse(c.getSvgResponse())); 105 | } 106 | c.setDocument(cacheDocumentMap.get(c.getClassName().substring(0, 3))); 107 | Document doc = c.getDocument(); 108 | Element e = null; 109 | if ((c.getX() == -6 && c.getY() == -6) || (c.getX() % -12 == -7 && c.getY() == -6)){ 110 | e = doc.select("text").first(); 111 | } else if ((c.getX() == -7 && c.getY() == -7) || (c.getX() % 14 == -8 && c.getY() == -7)){ 112 | e = doc.select("text").first(); 113 | } else if (c.getX() % 6 == -1 && c.getY() == -6){ 114 | e = doc.select("text").first(); 115 | } else if (c.getX() % -12 == 0 && c.getY() % -30 == -6){ 116 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first(); 117 | } else if (c.getX() % -14 == 0 && c.getY() % -30 == -7){ 118 | e = doc.select("textPath[xlink:href='#" + (c.getyIndex() + 1) + "']").first(); 119 | } 120 | if (c == null){ 121 | //为上一个 122 | //CssBackground{className='.hy-GijB', x=-7, y=-6, xIndex=0, yIndex=0, actualFont='null'} 123 | //CssBackground{className='.hy-o8Bu', x=-19, y=-6, xIndex=0, yIndex=0, actualFont='null'} 124 | //CssBackground{className='.hy-7IxC', x=-31, y=-6, xIndex=0, yIndex=0, actualFont='null'} 125 | //CssBackground{className='.hy-8zQE', x=-43, y=-6, xIndex=0, yIndex=0, actualFont='null'} 126 | //CssBackground{className='.hy-PrgG', x=-55, y=-6, xIndex=0, yIndex=0, actualFont='null'} 127 | //CssBackground{className='.hy-Qbc8', x=-67, y=-6, xIndex=0, yIndex=0, actualFont='null'} 128 | //CssBackground{className='.hy-TnVD', x=-79, y=-6, xIndex=0, yIndex=0, actualFont='null'} 129 | //CssBackground{className='.hy-TqUO', x=-91, y=-6, xIndex=0, yIndex=0, actualFont='null'} 130 | //CssBackground{className='.hy-UkCG', x=-103, y=-6, xIndex=0, yIndex=0, actualFont='null'} 131 | //CssBackground{className='.hy-yOPP', x=-114, y=-6, xIndex=0, yIndex=0, actualFont='null'} 132 | //todo最后一个不满足规则 133 | } 
134 | String text = e.text(); 135 | c.setActualFont(text.substring(c.getxIndex(), c.getxIndex() + 1)); 136 | cssBackgroundMap.put(c.getClassName().substring(1, c.getClassName().length()), c); 137 | return c; 138 | }).collect(Collectors.toList()); 139 | //还原网页 140 | Pattern spanPattern = Pattern.compile(""); 141 | Matcher contentMatcher = spanPattern.matcher(originalContent); 142 | while (contentMatcher.find()){ 143 | String s1 = contentMatcher.group(0); 144 | String s2 = cssBackgroundMap.get(contentMatcher.group(1)).getActualFont(); 145 | originalContent = originalContent.replace(s1, s2); 146 | } 147 | System.out.println(originalContent); 148 | } 149 | static class CssBackground{ 150 | private String className; 151 | private Integer x; 152 | private Integer y; 153 | private int xIndex; 154 | private int yIndex; 155 | private String svgResponse; 156 | private String actualFont; 157 | private Document document; 158 | 159 | public CssBackground(String className, int x, int y) { 160 | this.className = className; 161 | this.x = x; 162 | this.y = y; 163 | } 164 | 165 | public String getClassName() { 166 | return className; 167 | } 168 | 169 | public void setClassName(String className) { 170 | this.className = className; 171 | } 172 | 173 | public Integer getX() { 174 | return x; 175 | } 176 | 177 | public void setX(Integer x) { 178 | this.x = x; 179 | } 180 | 181 | public Integer getY() { 182 | return y; 183 | } 184 | 185 | public void setY(Integer y) { 186 | this.y = y; 187 | } 188 | 189 | public int getxIndex() { 190 | return xIndex; 191 | } 192 | 193 | public void setxIndex(int xIndex) { 194 | this.xIndex = xIndex; 195 | } 196 | 197 | public int getyIndex() { 198 | return yIndex; 199 | } 200 | 201 | public void setyIndex(int yIndex) { 202 | this.yIndex = yIndex; 203 | } 204 | 205 | public String getSvgResponse() { 206 | return svgResponse; 207 | } 208 | 209 | public void setSvgResponse(String svgResponse) { 210 | this.svgResponse = svgResponse; 211 | } 212 | 213 | 
public String getActualFont() { 214 | return actualFont; 215 | } 216 | 217 | public void setActualFont(String actualFont) { 218 | this.actualFont = actualFont; 219 | } 220 | 221 | public Document getDocument() { 222 | return document; 223 | } 224 | 225 | public void setDocument(Document document) { 226 | this.document = document; 227 | } 228 | 229 | @Override 230 | public String toString() { 231 | return "CssBackground{" + 232 | "className='" + className + '\'' + 233 | ", x=" + x + 234 | ", y=" + y + 235 | ", xIndex=" + xIndex + 236 | ", yIndex=" + yIndex + 237 | ", actualFont='" + actualFont + '\'' + 238 | '}'; 239 | } 240 | } 241 | } --------------------------------------------------------------------------------