├── README.md
└── xxx-spider
├── .gitignore
├── pom.xml
└── src
├── main
├── java
│ └── com
│ │ └── zsf
│ │ └── xxx
│ │ ├── AbstractCrawler.java
│ │ ├── Crawler.java
│ │ ├── Launcher.java
│ │ ├── PornhubCrawler.java
│ │ ├── XvideosCrawler.java
│ │ ├── http
│ │ ├── AbstractHttpViewer.java
│ │ ├── CorsAnywhereViewer.java
│ │ ├── HttpDebugerViewer.java
│ │ ├── HttpViewer.java
│ │ └── JsonpAfeldViewer.java
│ │ └── util
│ │ ├── BatchRecursiveAction.java
│ │ ├── BatchRecursiveTask.java
│ │ ├── DownloadBatchRecursiveTask.java
│ │ ├── FileInfo.java
│ │ ├── FileItemInfo.java
│ │ ├── HttpDownloader.java
│ │ └── ParallelComputeUtil.java
└── resources
│ ├── LICENSE
│ └── log4j2.xml
└── test
└── java
└── com
└── zsf
└── xxx
└── JunitTest.java
/README.md:
--------------------------------------------------------------------------------
1 | # xxx
2 | 不用梯子获取pornhub xvideos的视频地址。获取到地址后可直接用迅雷批量下载视频文件,也可用VLC在线播放视频文件。
3 |
--------------------------------------------------------------------------------
/xxx-spider/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | /.classpath
3 | /.project
4 | /.settings/
5 |
--------------------------------------------------------------------------------
/xxx-spider/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <groupId>com.zsf.xxx</groupId>
5 |   <artifactId>xxx-spider</artifactId>
6 |   <version>1.0.0</version>
7 | 
8 |   <dependencies>
9 |     <dependency>
10 |       <groupId>commons-io</groupId>
11 |       <artifactId>commons-io</artifactId>
12 |       <version>2.5</version>
13 |     </dependency>
14 |     <dependency>
15 |       <groupId>org.apache.commons</groupId>
16 |       <artifactId>commons-lang3</artifactId>
17 |       <version>3.9</version>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>org.apache.commons</groupId>
21 |       <artifactId>commons-text</artifactId>
22 |       <version>1.8</version>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>com.squareup.okhttp3</groupId>
26 |       <artifactId>okhttp</artifactId>
27 |       <version>4.1.0</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.jsoup</groupId>
31 |       <artifactId>jsoup</artifactId>
32 |       <version>1.12.1</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.apache.logging.log4j</groupId>
36 |       <artifactId>log4j-slf4j-impl</artifactId>
37 |       <version>2.12.1</version>
38 |     </dependency>
39 |     <dependency>
40 |       <groupId>com.alibaba</groupId>
41 |       <artifactId>fastjson</artifactId>
42 |       <version>1.2.60</version>
43 |     </dependency>
44 |     <dependency>
45 |       <groupId>junit</groupId>
46 |       <artifactId>junit</artifactId>
47 |       <version>4.12</version>
48 |       <scope>test</scope>
49 |     </dependency>
50 |     <dependency>
51 |       <groupId>org.projectlombok</groupId>
52 |       <artifactId>lombok</artifactId>
53 |       <version>1.18.8</version>
54 |       <scope>provided</scope>
55 |     </dependency>
56 |     <dependency>
57 |       <groupId>commons-codec</groupId>
58 |       <artifactId>commons-codec</artifactId>
59 |       <version>1.13</version>
60 |     </dependency>
61 |     <dependency>
62 |       <groupId>org.apache.tika</groupId>
63 |       <artifactId>tika-parsers</artifactId>
64 |       <version>1.22</version>
65 |     </dependency>
66 |   </dependencies>
67 | 
68 |   <build>
69 |     <plugins>
70 |       <plugin>
71 |         <groupId>org.apache.maven.plugins</groupId>
72 |         <artifactId>maven-assembly-plugin</artifactId>
73 |         <version>3.1.1</version>
74 |         <configuration>
75 |           <archive>
76 |             <manifest>
77 |               <addClasspath>true</addClasspath>
78 |               <mainClass>com.zsf.xxx.Launcher</mainClass>
79 |             </manifest>
80 |             <compress>true</compress>
81 |           </archive>
82 |           <descriptorRefs>
83 |             <descriptorRef>jar-with-dependencies</descriptorRef>
84 |           </descriptorRefs>
85 |           <encoding>UTF-8</encoding>
86 |         </configuration>
87 |         <executions>
88 |           <execution>
89 |             <id>assemble-all</id>
90 |             <phase>package</phase>
91 |             <goals>
92 |               <goal>single</goal>
93 |             </goals>
94 |           </execution>
95 |         </executions>
96 |       </plugin>
97 |     </plugins>
98 |   </build>
99 | </project>
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/AbstractCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.commons.io.FileUtils;
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | /**
15 | * @author papapa
16 | *
17 | */
18 | public abstract class AbstractCrawler implements Crawler{
19 |
20 | private static final Logger log = LoggerFactory.getLogger(AbstractCrawler.class);
21 |
22 | @Override
23 | public void execute(String dir) throws IOException {
24 | if (StringUtils.isBlank(dir)) {
25 | throw new IOException("文件保存目录为空");
26 | }
27 | Map categories = getCategories();
28 | if (categories != null) {
29 | for (Map.Entry entry : categories.entrySet()) {
30 | String title = entry.getKey();
31 | String href = entry.getValue();
32 | List viewUrls = getViewUrls(href);
33 | log.info("获取所有[{}]播放地址成功:{}", title, viewUrls);
34 | List videoUrls = getVideoUrls(viewUrls);
35 | log.info("获取所有[{}]视频地址成功:{}", title, videoUrls);
36 |
37 | String downloadFileName = dir + File.separatorChar + title + File.separatorChar + "download" + ".txt";
38 | FileUtils.writeLines(new File(downloadFileName),"UTF-8", videoUrls, false);
39 | log.info("[{}]视频链接保存在{}",title,downloadFileName);
40 |
41 | log.info("开始下载[{}]下的视频",title);
42 | downloadVideos(dir + File.separatorChar + title, videoUrls);
43 | log.info("结束下载[{}]下的视频",title);
44 | }
45 | }
46 | }
47 |
48 | public void downloadVideos(String dir,List videoUrls){
49 | for(String videoUrl : videoUrls){
50 | downloadVideo(dir,videoUrl);
51 | }
52 | }
53 |
54 | public List getVideoUrls(List viewUrls) {
55 | List videoUrls = new ArrayList<>();
56 | if (viewUrls != null && viewUrls.size() > 0) {
57 | for (String viewUrl : viewUrls) {
58 | String videoUrl = getVideoUrl(viewUrl);
59 | if(StringUtils.isNotBlank(videoUrl)){
60 | videoUrls.add(videoUrl);
61 | }
62 | }
63 | }
64 | return videoUrls;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/Crawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | /**
8 | *
9 | * @author papapa
10 | *
11 | */
12 | public interface Crawler {
13 |
14 | /**
15 | * 爬虫入口
16 | *
17 | * @param dir
18 | * 文件保存目录
19 | * @throws IOException
20 | */
21 | void execute(String dir) throws IOException;
22 |
23 | /**
24 | * 获取分类集合
25 | *
26 | * @return
27 | */
28 | Map getCategories();
29 |
30 | /**
31 | * 获取视频地址
32 | *
33 | * @param href
34 | * @return
35 | */
36 | List getViewUrls(String href);
37 |
38 | /**
39 | * 获取视频高清地址
40 | *
41 | * @param viewUrl
42 | * @return
43 | * @throws IOException
44 | */
45 | String getVideoUrl(String viewUrl);
46 |
47 | /**
48 | * 下载视频
49 | * @param dir 文件保存目录
50 | * @param videoUrl 视频地址
51 | * @return
52 | */
53 | boolean downloadVideo(String dir,String videoUrl);
54 | }
55 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/Launcher.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.nio.charset.Charset;
6 | import java.util.List;
7 |
8 | import org.apache.commons.io.FileUtils;
9 | import org.apache.commons.io.IOUtils;
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | /**
15 | * @author papapa
16 | *
17 | */
18 | public class Launcher {
19 |
20 | private static final Logger log = LoggerFactory.getLogger(Launcher.class);
21 |
22 | public static void main(String[] args) throws IOException {
23 | InputStream in = Launcher.class.getResourceAsStream("/LICENSE");
24 | if(in != null){
25 | List lines = IOUtils.readLines(in,Charset.forName("UTF-8"));
26 | log.info(StringUtils.join(lines,"\r\n"));
27 | }
28 | log.info("爬虫开始");
29 | String fileDir = FileUtils.getTempDirectoryPath();//默认文件保存目录
30 | if(args != null && args.length > 0){
31 | fileDir = args[0];
32 | }
33 | log.info("视频地址文件保存在:{}",fileDir);
34 | //new PornhubCrawler().execute(fileDir);
35 | new XvideosCrawler().execute(fileDir);
36 | log.info("爬虫停止");
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/PornhubCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.commons.lang3.StringUtils;
10 | import org.jsoup.nodes.Document;
11 | import org.jsoup.nodes.Element;
12 | import org.jsoup.select.Elements;
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | import com.alibaba.fastjson.JSONPath;
17 | import com.zsf.xxx.http.HttpViewer;
18 |
19 | /**
20 | * @author papapa
21 | *
22 | */
23 | public class PornhubCrawler extends AbstractCrawler {
24 |
25 | private static final Logger log = LoggerFactory.getLogger(PornhubCrawler.class);
26 |
27 | private static final String BASE_URL = "https://cn.pornhub.com";// 中文pornhub
28 |
29 | private static final String PARAMS = "hd=1";// 高清
30 |
31 | /**
32 | * 下载视频
33 | *
34 | * @param dir
35 | * @param videoUrl
36 | */
37 | @Override
38 | public boolean downloadVideo(String dir, String videoUrl) {
39 | return false;
40 | }
41 |
42 | /**
43 | * 获取视频地址(720p mp4格式)
44 | *
45 | * @param viewUrl
46 | * @return
47 | * @throws IOException
48 | */
49 | @Override
50 | public String getVideoUrl(String viewUrl) {
51 | String videoUrl = null;
52 | Document doc;
53 | try {
54 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
55 | if (doc == null) {
56 | return null;
57 | }
58 | String scriptStr = doc.selectFirst("div#player > script").toString();
59 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}";
60 |
61 | // 获取720p且为mp4格式的地址
62 | videoUrl = (String) JSONPath.eval(scriptStr, "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl");
63 | } catch (IOException e) {
64 | log.error("获取视频地址错误:[" + videoUrl + "]", e);
65 | }
66 |
67 | return videoUrl;
68 | }
69 |
70 | /**
71 | * 获取该分类地址下的所有播放地址
72 | *
73 | * @param href
74 | * @return
75 | * @throws IOException
76 | */
77 | @Override
78 | public List getViewUrls(String href) {
79 | List viewUrls = new ArrayList<>();
80 | Document doc;
81 | try {
82 | doc = HttpViewer.getRandomInstance().getResponseDoc(href);
83 | if (doc == null) {
84 | return null;
85 | }
86 | List pageUrls = getPageUrls(doc);
87 | if (pageUrls != null) {
88 | viewUrls.addAll(pageUrls);
89 | }
90 |
91 | Element nextPageElement = doc.selectFirst("div.pagination3 > ul > li.page_next > a");
92 | if (nextPageElement != null) {
93 | String nextPageHref = BASE_URL + "/" + nextPageElement.attr("href");// 下一页地址
94 | if (nextPageHref.endsWith("page=2")) {// 为了测试只获取1页数据
95 | return viewUrls;
96 | }
97 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址
98 | }
99 | } catch (IOException e) {
100 | log.error("获取文档对象错误:[" + href + "]", e);
101 | }
102 |
103 | return viewUrls;
104 | }
105 |
106 | /**
107 | * 获取当前页中每个的播放地址
108 | *
109 | * @param doc
110 | * @return
111 | */
112 | public List getPageUrls(Document doc) {
113 | List urls = null;
114 | Elements lis = doc.select("ul#videoCategory > li.js-pop");
115 | if (lis != null && lis.size() > 0) {
116 | urls = new ArrayList<>();
117 | for (Element li : lis) {
118 | urls.add(BASE_URL + "/view_video.php?viewkey=" + li.attr("_vkey"));
119 | }
120 | }
121 | return urls;
122 | }
123 |
124 | /**
125 | * 获取分类集合
126 | *
127 | * @return
128 | * @throws IOException
129 | */
130 | @Override
131 | public Map getCategories() {
132 | Map result = null;
133 | Document doc;
134 | try {
135 | doc = HttpViewer.getRandomInstance().getResponseDoc(BASE_URL + "/categories");
136 | if (doc == null) {
137 | return null;
138 | }
139 | result = new HashMap<>();
140 | Elements lis = doc.select("ul#categoriesListSection > li");
141 | for (Element li : lis) {
142 | String href = BASE_URL + li.selectFirst("div.category-wrapper > a").attr("href");
143 | if (href.contains("?")) {// 只获取高清视频
144 | href += "&" + PARAMS;
145 | } else {
146 | href += "?" + PARAMS;
147 | }
148 | String title = li.selectFirst("div.category-wrapper > h5 > a").attr("data-mxptext");
149 | result.put(title, href);
150 | log.info("分类:{},地址:{}", title, href);
151 | if (result.size() > 2)
152 | break;// 只获取3个
153 | }
154 | } catch (IOException e) {
155 | log.error("获取分类错误:", e);
156 | }
157 | return result;
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/XvideosCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.jsoup.nodes.Document;
12 | import org.jsoup.nodes.Element;
13 | import org.jsoup.select.Elements;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import com.zsf.xxx.http.HttpViewer;
18 | import com.zsf.xxx.util.HttpDownloader;
19 |
20 | /**
21 | * @author papapa
22 | *
23 | */
24 | public class XvideosCrawler extends AbstractCrawler{
25 |
26 | private static final Logger log = LoggerFactory.getLogger(XvideosCrawler.class);
27 |
28 | private static final String BASE_URL = "https://www.xvideos.com";
29 |
30 | /**
31 | * 获取视频高清地址
32 | *
33 | * @param viewUrl
34 | * @return
35 | * @throws IOException
36 | */
37 | @Override
38 | public String getVideoUrl(String viewUrl) {
39 | String videoUrl = null;
40 | Document doc;
41 | try {
42 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
43 | if (doc == null) {
44 | return null;
45 | }
46 | Elements scripts = doc.getElementsByTag("script");
47 | if (scripts == null || scripts.size() == 0) {
48 | return null;
49 | }
50 | for (Element script : scripts) {
51 | String scriptStr = script.html();
52 | if (StringUtils.contains(scriptStr, "html5player.setVideoUrlHigh('")) {
53 | videoUrl = StringUtils.substringBetween(scriptStr, "html5player.setVideoUrlHigh('", "');");
54 | break;
55 | }
56 | }
57 | } catch (IOException e) {
58 | log.error("获取视频地址错误:[" + viewUrl + "]", e);
59 | }
60 |
61 | return videoUrl;
62 | }
63 |
64 | /**
65 | * 获取该分类地址下的所有播放地址
66 | *
67 | * @param href
68 | * @return
69 | * @throws IOException
70 | */
71 | @Override
72 | public List getViewUrls(String href) {
73 | List viewUrls = new ArrayList<>();
74 | Document doc;
75 | try {
76 | doc = HttpViewer.getRandomInstance().getResponseDoc(href);
77 | if (doc == null) {
78 | return null;
79 | }
80 | List pageUrls = getPageUrls(doc);
81 | if (pageUrls != null) {
82 | viewUrls.addAll(pageUrls);
83 | }
84 |
85 | Element nextPageElement = doc.selectFirst("div.pagination > ul > li > a.next-page");
86 | if (nextPageElement != null) {
87 | String nextPageHref = BASE_URL + nextPageElement.attr("href");// 下一页地址
88 | if (nextPageHref.contains("/3") || nextPageHref.contains("p=3")) {// 为了测试只获取2页数据
89 | return viewUrls;
90 | }
91 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址
92 | }
93 | } catch (IOException e) {
94 | log.error("获取文档对象错误:[" + href + "]", e);
95 | }
96 |
97 | return viewUrls;
98 | }
99 |
100 | /**
101 | * 获取当前页中每个的播放地址
102 | *
103 | * @param doc
104 | * @return
105 | */
106 | public List getPageUrls(Document doc) {
107 | //Elements links = doc.select("div#content > div.mozaique > div div.thumb > a");
108 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
109 | if(insides == null || insides.size() == 0){
110 | return null;
111 | }
112 | List urls = new ArrayList<>();
113 | for(Element inside : insides){
114 | Element hdElement = inside.selectFirst("span.video-hd-mark");//获取高清(720p)视频
115 | if(hdElement != null){
116 | Element linkElement = inside.selectFirst("div.thumb > a");
117 | String link = linkElement.attr("href");
118 | urls.add(BASE_URL + link);
119 | }
120 | }
121 | return urls;
122 | }
123 |
124 | @Override
125 | public Map getCategories() {
126 | Map categories = new HashMap<>();
127 | categories.put("丝袜",BASE_URL+"/c/Stockings-28");
128 | categories.put("喷水",BASE_URL+"/c/Squirting-56");
129 | categories.put("女同",BASE_URL+"/?k=lesbian&quality=hd");
130 |
131 | return categories;
132 | }
133 |
134 | @Override
135 | public boolean downloadVideo(String dir, String videoUrl) {
136 | String fileName = StringUtils.substringAfterLast(videoUrl,"/");
137 | fileName = StringUtils.substringBefore(fileName, "?");
138 | String filePath = dir+File.separatorChar+fileName;
139 | try {
140 | new HttpDownloader().download(videoUrl, filePath);
141 | return true;
142 | } catch (IOException e) {
143 | e.printStackTrace();
144 | }
145 | return false;
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/AbstractHttpViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 | import java.time.Duration;
5 |
6 | import org.jsoup.nodes.Document;
7 |
8 | import okhttp3.OkHttpClient;
9 |
10 | /**
11 | * @author papapa
12 | *
13 | */
14 | public abstract class AbstractHttpViewer implements HttpViewer {
15 |
16 | public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36";
17 |
18 | @Override
19 | public String getResponesStr(String url) throws IOException {
20 | Document doc = getResponseDoc(url);
21 | if(doc != null){
22 | return doc.html();
23 | }
24 | return null;
25 | }
26 |
27 | public OkHttpClient.Builder getOkHttpClientBuilder(){
28 | OkHttpClient.Builder builder = new OkHttpClient.Builder()
29 | .followRedirects(true)
30 | .followSslRedirects(true)
31 | .retryOnConnectionFailure(true)
32 | .readTimeout(Duration.ofSeconds(10L))
33 | .connectTimeout(Duration.ofSeconds(10L))
34 | .readTimeout(Duration.ofSeconds(10L))
35 | .callTimeout(Duration.ofSeconds(10L));
36 | return builder;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/CorsAnywhereViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import okhttp3.Call;
11 | import okhttp3.OkHttpClient;
12 | import okhttp3.Request;
13 | import okhttp3.ResponseBody;
14 |
15 | /**
16 | * @author papapa
17 | *
18 | */
19 | public class CorsAnywhereViewer extends AbstractHttpViewer {
20 |
21 | private static final Logger log = LoggerFactory.getLogger(CorsAnywhereViewer.class);
22 |
23 | private static final String PROXY_URL = "https://cors-anywhere.herokuapp.com/";
24 |
25 | @Override
26 | public Document getResponseDoc(String url) throws IOException {
27 | log.info("获取页面:[{}]内容", url);
28 | OkHttpClient client = super.getOkHttpClientBuilder().build();
29 | Request request = new Request.Builder().url(PROXY_URL + url).header("x-requested-with", "XMLHttpRequest")
30 | .header("User-Agent", USER_AGENT).get().build();
31 | Call call = client.newCall(request);
32 | okhttp3.Response response = call.execute();
33 | int code = response.code();
34 | if (code == 200) {
35 | ResponseBody body = response.body();
36 | Document doc = Jsoup.parse(body.string());
37 | body.close();
38 | response.close();
39 | return doc;
40 | }
41 | return null;
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/HttpDebugerViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import org.apache.commons.text.StringEscapeUtils;
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import okhttp3.Call;
14 | import okhttp3.FormBody;
15 | import okhttp3.Headers;
16 | import okhttp3.OkHttpClient;
17 | import okhttp3.Request;
18 | import okhttp3.RequestBody;
19 | import okhttp3.ResponseBody;
20 |
21 | /**
22 | * @author papapa
23 | *
24 | */
25 | public class HttpDebugerViewer extends AbstractHttpViewer {
26 |
27 | private static final Logger log = LoggerFactory.getLogger(HttpDebugerViewer.class);
28 |
29 | private static final String PROXY_URL = "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx";
30 |
31 | @Override
32 | public Document getResponseDoc(String url) throws IOException {
33 | log.info("获取页面:[{}]内容", url);
34 | OkHttpClient client = super.getOkHttpClientBuilder().build();
35 | Request request = new Request.Builder().url(PROXY_URL).headers(Headers.of(getHeaders())).post(getRequestBody(url)).build();
36 | Call call = client.newCall(request);
37 | okhttp3.Response response = call.execute();
38 | int code = response.code();
39 | if (code == 200) {
40 | ResponseBody body = response.body();
41 | Document doc = Jsoup.parse(body.string());
42 | body.close();
43 | response.close();
44 | String html = doc.selectFirst("div#ResultData pre").html();
45 | html = StringEscapeUtils.unescapeHtml4(html);
46 | return Jsoup.parse(html);
47 | }
48 | return null;
49 | }
50 |
51 | private RequestBody getRequestBody(String url){
52 | FormBody.Builder builder = new FormBody.Builder();
53 | builder.add("UrlBox", url);
54 | builder.add("AgentList", "Google Chrome");
55 | builder.add("VersionsList", "HTTP/1.1");
56 | builder.add("MethodList", "GET");
57 | return builder.build();
58 | }
59 |
60 | private Map getHeaders(){
61 | Map headers = new HashMap<>();
62 | headers.put("Host", "www.httpdebugger.com");
63 | headers.put("Origin", "http://www.httpdebugger.com");
64 | headers.put("Pragma", "no-cache");
65 | headers.put("Referer", "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx");
66 | headers.put("Upgrade-Insecure-Requests", "1");
67 | return headers;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/HttpViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.lang3.RandomUtils;
6 | import org.jsoup.nodes.Document;
7 |
8 | /**
9 | * @author papapa
10 | *
11 | */
12 | public interface HttpViewer {
13 |
14 | /**
15 | * 获取url响应后的html内容
16 | * @param url
17 | * @return
18 | * @throws IOException
19 | */
20 | String getResponesStr(String url) throws IOException;
21 |
22 | /**
23 | * 获取url响应后的文档对像
24 | * @param url
25 | * @return
26 | * @throws IOException
27 | */
28 | Document getResponseDoc(String url) throws IOException;
29 |
30 | /**
31 | * 获取httpViewer的一个随机实例
32 | * @return
33 | */
34 | public static HttpViewer getRandomInstance(){
35 | HttpViewer[] VIEWERS = new HttpViewer[]{new CorsAnywhereViewer(),new HttpDebugerViewer(),new JsonpAfeldViewer()};
36 | int index = RandomUtils.nextInt(0, 2);
37 | return VIEWERS[index];
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/JsonpAfeldViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import okhttp3.Call;
11 | import okhttp3.OkHttpClient;
12 | import okhttp3.Request;
13 | import okhttp3.ResponseBody;
14 |
15 | /**
16 | * @author papapa
17 | *
18 | */
19 | public class JsonpAfeldViewer extends AbstractHttpViewer {
20 |
21 | private static final Logger log = LoggerFactory.getLogger(JsonpAfeldViewer.class);
22 |
23 | private static final String PROXY_URL = "https://jsonp.afeld.me/?url=";
24 |
25 | @Override
26 | public Document getResponseDoc(String url) throws IOException {
27 | log.info("获取页面:[{}]内容", url);
28 | OkHttpClient client = super.getOkHttpClientBuilder().build();
29 | Request request = new Request.Builder().url(PROXY_URL + url).header("User-Agent", USER_AGENT).get().build();
30 | Call call = client.newCall(request);
31 | okhttp3.Response response = call.execute();
32 | int code = response.code();
33 | if (code == 200) {
34 | ResponseBody body = response.body();
35 | Document doc = Jsoup.parse(body.string());
36 | body.close();
37 | response.close();
38 | return doc;
39 | }
40 | return null;
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveAction.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.lang.reflect.Constructor;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.concurrent.RecursiveAction;
8 |
9 | /**
10 | * @author papapa
11 | *
12 | */
13 | @SuppressWarnings("rawtypes")
14 | public abstract class BatchRecursiveAction extends RecursiveAction{
15 |
16 | private static final long serialVersionUID = -2909644333830555865L;
17 |
18 | private List items;
19 |
20 | private Object ext;
21 |
22 | protected BatchRecursiveAction(List items,Object ext){
23 | this.items = items;
24 | this.ext = ext;
25 | }
26 |
27 | @SuppressWarnings({ "unchecked" })
28 | @Override
29 | protected void compute() {
30 | if(items != null && items.size() > 0){
31 | if(items.size() == 1){
32 | computeItem(items.get(0));
33 | }else{
34 | List actions = new ArrayList<>();
35 | for(Object item : items){
36 | List subActions = new ArrayList<>();
37 | subActions.add(item);
38 | BatchRecursiveAction subBatchRecursiveAction = null;
39 | try {
40 | Constructor constructor = this.getClass().getDeclaredConstructors()[0];
41 | constructor.setAccessible(true);
42 | subBatchRecursiveAction = (BatchRecursiveAction) constructor.newInstance(subActions,this.ext);
43 | actions.add(subBatchRecursiveAction);
44 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException
45 | | InvocationTargetException | SecurityException e) {
46 | e.printStackTrace();
47 | }
48 | }
49 | invokeAll(actions);
50 | for(BatchRecursiveAction action : actions){
51 | action.join();
52 | }
53 | }
54 | }
55 | }
56 |
57 | public abstract void computeItem(Object item);
58 | }
59 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveTask.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.lang.reflect.Constructor;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.lang.reflect.Type;
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.concurrent.RecursiveTask;
10 |
11 | /**
12 | * @author papapa
13 | *
14 | */
15 | @SuppressWarnings("rawtypes")
16 | public abstract class BatchRecursiveTask extends RecursiveTask{
17 |
18 | private static final long serialVersionUID = 2119394771132854398L;
19 |
20 | private List items;
21 |
22 | private Object ext;
23 |
24 | protected BatchRecursiveTask(List items,Object ext){
25 | this.items = items;
26 | this.ext = ext;
27 | }
28 |
29 | public Object getExt(){
30 | return this.ext;
31 | }
32 |
33 | @SuppressWarnings({ "unchecked" })
34 | @Override
35 | protected List compute() {
36 | List values = new ArrayList<>();
37 | if(items != null && items.size() > 0){
38 | if(items.size() == 1){
39 | values.add(computeItem(items.get(0)));
40 | }else{
41 | BatchRecursiveTask[] tasks = new BatchRecursiveTask[items.size()];
42 | int index = 0;
43 | Constructor constructor = this.getClass().getDeclaredConstructors()[0];
44 | constructor.setAccessible(true);
45 | Type[] types = constructor.getGenericParameterTypes();//匿名内部内有多个参数
46 | Object[] initargs = new Object[types.length];
47 | for(Object item : items){
48 | try {
49 | initargs[types.length-1] = this.ext;
50 | initargs[types.length-2] = Arrays.asList(item);
51 | tasks[index] = (BatchRecursiveTask) constructor.newInstance(initargs);
52 | index++;
53 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException
54 | | InvocationTargetException | SecurityException e) {
55 | e.printStackTrace();
56 | }
57 | }
58 | invokeAll(tasks);
59 | for(BatchRecursiveTask task : tasks){
60 | values.addAll(task.join());
61 | }
62 | }
63 | }
64 | return values;
65 | }
66 | public abstract Object computeItem(Object item);
67 | }
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/DownloadBatchRecursiveTask.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.InputStream;
6 | import java.io.OutputStream;
7 | import java.net.HttpURLConnection;
8 | import java.net.SocketTimeoutException;
9 | import java.net.URL;
10 | import java.util.List;
11 | import java.util.concurrent.TimeUnit;
12 |
13 | import javax.net.ssl.SSLException;
14 |
15 | import org.apache.commons.io.FileUtils;
16 | import org.apache.commons.io.IOUtils;
17 |
18 | import lombok.extern.slf4j.Slf4j;
19 |
20 | /**
21 | * @author papapa
22 | *
23 | */
24 | @Slf4j
25 | public class DownloadBatchRecursiveTask extends BatchRecursiveTask {
26 |
27 | private static final long serialVersionUID = -6577872267278165509L;
28 |
29 | private static final int RETRY_TIMES = 10;// 重试次数
30 |
31 | private static final int CONNECTION_TIMEOUT = 10 * 1000;//连接超时时间10秒
32 |
33 | private static final int READ_TIMEOUT = 10 * 1000;//读超时时间10秒
34 |
35 | protected DownloadBatchRecursiveTask(List> items, Object ext) {
36 | super(items, ext);
37 | }
38 |
39 | @Override
40 | public Object computeItem(Object item) {
41 | FileItemInfo itemInfo = (FileItemInfo) item;
42 | String filePath = itemInfo.getFilePath();
43 | File file = new File(filePath);
44 | long downloadLength = getDownloadLength(itemInfo);
45 | if (file.exists()) {
46 | if (file.length() == downloadLength) {
47 | log.info("分片[{}]已下载成功",itemInfo.getPartIndex());
48 | itemInfo.setDownloadSuccess(true);
49 | return itemInfo;
50 | }
51 | }
52 | downloadPart(itemInfo, RETRY_TIMES);
53 | return itemInfo;
54 | }
55 |
56 | private FileItemInfo downloadPart(FileItemInfo itemInfo, int retryTimes) {
57 | String filePath = itemInfo.getFilePath();
58 | int partIndex = itemInfo.getPartIndex();
59 |
60 | String url = itemInfo.getUrl();
61 | String range = getRange(itemInfo);
62 | InputStream input = null;
63 | OutputStream output = null;
64 | if(range == null){
65 | itemInfo.setDownloadSuccess(true);
66 | log.info("分片[{}]下存在,无需下载",itemInfo.getPartIndex());
67 | return itemInfo;
68 | }
69 | if (retryTimes == 0) {
70 | log.error("分片[{}]重试5次后依旧下载失败",partIndex);
71 | return null;
72 | }
73 | log.info("分片[{}],range:{},文件:{}",partIndex, range,filePath);
74 | HttpURLConnection connection = null;
75 | try {
76 | connection = (HttpURLConnection) new URL(url).openConnection();
77 | connection.setConnectTimeout(CONNECTION_TIMEOUT);
78 | connection.setReadTimeout(READ_TIMEOUT);
79 |
80 | HttpURLConnection.setFollowRedirects(true);
81 | connection.setRequestProperty("range", "bytes=" + range);
82 | connection.connect();
83 | // 获取响应吗
84 | int responseCode = connection.getResponseCode();
85 | if (responseCode != 206 && responseCode != 200) {
86 | return null;
87 | }
88 | input = connection.getInputStream();
89 | output = new FileOutputStream(filePath, true);
90 | IOUtils.copy(input, output, 1024);
91 | output.flush();
92 | connection.disconnect();
93 | itemInfo.setDownloadSuccess(true);
94 | log.info("分片[{}]下载成功",partIndex);
95 | } catch (Exception e) {
96 | if(!(e instanceof SocketTimeoutException) && !(e instanceof SSLException)){
97 | log.error("分片["+partIndex+"]下载失败:",e);
98 | }
99 | retryTimes--;
100 | log.error("分片[{}]开始重试第[{}]次",partIndex,RETRY_TIMES - retryTimes);
101 | try {
102 | TimeUnit.SECONDS.sleep(1L);
103 | } catch (InterruptedException e1) {
104 | e1.printStackTrace();
105 | }
106 | if(connection != null){
107 | connection.disconnect();
108 | }
109 | IOUtils.closeQuietly(input);
110 | IOUtils.closeQuietly(output);
111 |
112 | downloadPart(itemInfo, retryTimes);
113 | }
114 | return itemInfo;
115 | }
116 |
117 | /**
118 | * 要下载的长度
119 | * @param itemInfo
120 | * @return
121 | */
122 | private long getDownloadLength(FileItemInfo itemInfo){
123 | long downloadLength = itemInfo.getEndIndex() - itemInfo.getStartIndex() + (itemInfo.isLastSharding() ? 0 : 1);
124 | return downloadLength;
125 | }
126 |
127 | /**
128 | * 获取range
129 | * @param itemInfo
130 | * @return 当返回null时表示不需要再重新下载
131 | */
132 | private String getRange(FileItemInfo itemInfo){
133 | String filePath = itemInfo.getFilePath();
134 | File tempFile = new File(filePath);
135 | FileInfo fileInfo = itemInfo.getFileInfo();
136 | long contentLength = fileInfo.getContentLength();
137 |
138 | long tempContentLength = tempFile.exists() ? FileUtils.sizeOf(tempFile) : 0;
139 | log.info("临时文件[{}]大小{}Bytes",tempFile.getAbsolutePath(),tempContentLength);
140 | String range = itemInfo.getStartIndex() + "-";
141 | if(tempContentLength == 0){
142 | if (itemInfo.getEndIndex() != contentLength) {
143 | range += itemInfo.getEndIndex();
144 | }
145 | }else{
146 | if (getDownloadLength(itemInfo) == tempContentLength) {
147 | range = null;
148 | }else{
149 | range = itemInfo.getStartIndex() + tempContentLength + "-"+ itemInfo.getEndIndex();
150 | }
151 | }
152 | return range;
153 | }
154 | }
155 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/FileInfo.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import lombok.Getter;
4 | import lombok.Setter;
5 | import lombok.experimental.Accessors;
6 |
/**
 * Metadata about a remote file, populated from the probe request's HTTP
 * response headers before the actual download starts.
 *
 * @author papapa
 *
 */
@Getter
@Setter
@Accessors(chain = true)
public class FileInfo {

    private String url;// original download url of the file

    private String filePath;// local path the file is saved to

    private long contentLength;// Content-Length in bytes

    private String contentType;// Content-Type response header

    private long lastModified;// Last-Modified timestamp (epoch millis, from URLConnection.getLastModified)

    private boolean suportSharding;// whether the server supports ranged (sharded) download; misspelling kept — the lombok getter is public API

}
29 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/FileItemInfo.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import lombok.Getter;
4 | import lombok.Setter;
5 | import lombok.experimental.Accessors;
6 |
7 | /**
8 | * @author papapa
9 | *
10 | */
11 | @Getter
12 | @Setter
13 | @Accessors(chain = true)
14 | public class FileItemInfo implements Comparable {
15 |
16 | private FileInfo fileInfo;
17 |
18 | private String url;// 下载地址
19 |
20 | private String filePath;// 文件名
21 |
22 | private int partIndex;// 分片位置
23 |
24 | private long startIndex;// 文件开始下载位置
25 |
26 | private long endIndex;// 文件结束位置
27 |
28 | private boolean downloadSuccess;//是否下载成功
29 |
30 | private boolean firstSharding;//是否首片
31 |
32 | private boolean lastSharding;//是否最后一片
33 |
34 | @Override
35 | public int compareTo(FileItemInfo o) {
36 | int partIndex1 = this.getPartIndex();
37 | int partIndex2 = o.getPartIndex();
38 | if (partIndex1 < partIndex2) {
39 | return -1;
40 | } else if (partIndex1 > partIndex2) {
41 | return 1;
42 | } else {
43 | return 0;
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/HttpDownloader.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;

import lombok.extern.slf4j.Slf4j;
18 |
19 | /**
20 | * @author papapa
21 | *
22 | */
23 | @Slf4j
24 | public class HttpDownloader {
25 |
26 | private static final String TMP_PATH = FileUtils.getTempDirectoryPath();// 临时文件保存目录
27 |
28 | private static final int CONNECTION_TIMEOUT = 10 * 1000;//连接超时时间10秒
29 |
30 | private static final int READ_TIMEOUT = 10 * 1000;//读超时时间10秒
31 |
32 |
33 | public void download(String url, String filePath) throws IOException {
34 | FileInfo fileInfo = getFileInfo(url);
35 | if (fileInfo == null) {
36 | throw new IOException("获取文件信息失败:["+url+"]");
37 | }
38 | fileInfo.setFilePath(filePath);
39 | log.info("文件信息:{}",ToStringBuilder.reflectionToString(fileInfo, ToStringStyle.JSON_STYLE));
40 |
41 | File savedFile = new File(filePath);
42 | if (savedFile.exists()) {
43 | if (FileUtils.sizeOf(savedFile) == fileInfo.getContentLength()) {
44 | log.info("文件已下载完成:[{}]", filePath);
45 | return;
46 | }
47 | FileUtils.forceDelete(savedFile);
48 | }
49 | savedFile.createNewFile();
50 | if(fileInfo.isSuportSharding()){//支持分片下载
51 | long contentLength = fileInfo.getContentLength();
52 | int threadNum = Runtime.getRuntime().availableProcessors() * 2;
53 | if(contentLength <= threadNum){
54 | threadNum = 1;
55 | }
56 | multiThreadsDownload(fileInfo, threadNum);
57 | }else{
58 | singleDownload(fileInfo);
59 | }
60 | }
61 |
62 | /**
63 | * 单线程下载
64 | * @param fileInfo
65 | */
66 | private void singleDownload(FileInfo fileInfo) {
67 |
68 | }
69 |
70 | /**
71 | * 开始下载文件
72 | * @param fileInfo
73 | * @param threadNum
74 | * @throws IOException
75 | */
76 | private void multiThreadsDownload(FileInfo fileInfo, int threadNum) throws IOException {
77 | long contentLength = fileInfo.getContentLength();
78 | if(contentLength <= threadNum){//内容长度小于等于线程数时,只需要一个线程下载
79 | threadNum = 1;
80 | }
81 | List fileItemInfos = new ArrayList<>();
82 |
83 | long partSize = contentLength / threadNum + 1;
84 |
85 | log.info("每片大小[{}]Bytes",partSize);
86 | for (int i = 0; i < threadNum; i++) {
87 | long startIndex = i * partSize;
88 | startIndex = i == 0 ? 0 : fileItemInfos.get(i-1).getEndIndex()+1L;
89 |
90 | long endIndex = startIndex + partSize -1 ;
91 |
92 | if (i == threadNum - 1) {
93 | endIndex = fileInfo.getContentLength();
94 | }
95 | log.info("分片[{}]区间:{}-{},大小:{}",i+1,startIndex,endIndex,endIndex-startIndex+1);
96 | FileItemInfo fileItemInfo = new FileItemInfo();
97 | fileItemInfo.setPartIndex(i+1);
98 | fileItemInfo.setUrl(fileInfo.getUrl());
99 | fileItemInfo.setStartIndex(startIndex);
100 | fileItemInfo.setFilePath(getTmpFilePath(fileInfo, i+1));
101 | fileItemInfo.setEndIndex(endIndex);
102 | fileItemInfo.setFileInfo(fileInfo);
103 | fileItemInfo.setFirstSharding(i==0);
104 | fileItemInfo.setLastSharding(i == threadNum - 1);
105 | fileItemInfos.add(fileItemInfo);
106 | }
107 |
108 | @SuppressWarnings("unchecked")
109 | List results = (List) ParallelComputeUtil.compute(new DownloadBatchRecursiveTask(fileItemInfos, null));
110 | mergeFile(results);
111 |
112 | }
113 |
114 | /**
115 | * 合并文件
116 | * @param results
117 | * @throws IOException
118 | */
119 | private void mergeFile(List results) throws IOException {
120 | if(results == null){
121 | log.info("下载失败");
122 | return;
123 | }
124 | Collections.sort(results);
125 | for(FileItemInfo itemInfo : results){
126 | if(!itemInfo.isDownloadSuccess()){
127 | log.error("分片[{}]下载失败,取消合并文件",itemInfo.getPartIndex());
128 | return;
129 | }
130 | }
131 | addPartIntoFile(results);
132 | String filePath = results.get(0).getFileInfo().getFilePath();
133 | log.info("文件[{}]下载完成,大小:{}Bytes",filePath,FileUtils.sizeOf(new File(filePath)));
134 | deleteTempFile(results);
135 | }
136 |
137 | /**
138 | * 删除临时文件
139 | * @param items
140 | */
141 | private void deleteTempFile(List items){
142 | for(FileItemInfo item : items){
143 | boolean flag = FileUtils.deleteQuietly(new File(item.getFilePath()));
144 | log.info("临时文件[{}]删除{}",item.getFilePath(),flag ? "成功" : "失败");
145 | }
146 | }
147 | /**
148 | * 将分片下载的文件写入主文件
149 | * @param itemInfo
150 | * @throws IOException
151 | */
152 | private void addPartIntoFile(List itemInfos) throws IOException{
153 | String filePath = itemInfos.get(0).getFileInfo().getFilePath();
154 | RandomAccessFile file = new RandomAccessFile(new File(filePath), "rw");
155 | for(FileItemInfo itemInfo : itemInfos){
156 | file.seek(itemInfo.getStartIndex());
157 | File tempFile = new File(itemInfo.getFilePath());
158 | file.write(FileUtils.readFileToByteArray(tempFile));
159 | }
160 | file.close();
161 | }
162 |
163 | /**
164 | * 获取文件信息
165 | * @param url
166 | * @return
167 | * @throws IOException
168 | */
169 | private FileInfo getFileInfo(String url) throws IOException {
170 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
171 | connection.setConnectTimeout(CONNECTION_TIMEOUT);
172 | connection.setReadTimeout(READ_TIMEOUT);
173 | HttpURLConnection.setFollowRedirects(true);
174 | connection.connect();
175 | // 获取响应吗
176 | int responseCode = connection.getResponseCode();
177 | log.info("[{}]响应码为:{}", url, responseCode);
178 | if (responseCode != 200) {
179 | return null;
180 | }
181 | boolean suportSharding = false;
182 | if("bytes".equals(connection.getHeaderField("Accept-Ranges"))){
183 | suportSharding = true;
184 | }
185 |
186 | String contentType = connection.getContentType();
187 | long contentLength = connection.getContentLengthLong();
188 | long lastModified = connection.getLastModified();
189 | connection.disconnect();
190 |
191 | FileInfo fileInfo = new FileInfo().setUrl(url).setContentLength(contentLength).setContentType(contentType)
192 | .setLastModified(lastModified).setSuportSharding(suportSharding);
193 |
194 | return fileInfo;
195 | }
196 |
197 | /**
198 | * 获取临时文件路径
199 | *
200 | * @param url
201 | * @return
202 | */
203 | private String getTmpFilePath(FileInfo fileInfo, int partIndex) {
204 | String contentType = fileInfo.getContentType();
205 | long contentLength = fileInfo.getContentLength();
206 | long lastModified = fileInfo.getLastModified();
207 | return TMP_PATH+DigestUtils.md5Hex(contentType + contentLength + lastModified) + ".tmp_" + partIndex;
208 | //return StringUtils.substringBeforeLast(filePath, File.separator)+File.separator+DigestUtils.md5Hex(url + filePath + contentType + contentLength + lastModified) + ".tmp_" + partIndex;
209 | }
210 | }
211 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/ParallelComputeUtil.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.util.List;
4 | import java.util.concurrent.ForkJoinPool;
5 | import java.util.concurrent.RecursiveAction;
6 | import java.util.concurrent.RecursiveTask;
7 |
/**
 * Helper for running ForkJoin tasks/actions, one short-lived pool per call.
 *
 * NOTE(review): generic type parameters appear to have been stripped from this
 * listing (e.g. "public static V compute(...)" was presumably
 * "public static <V> V compute(RecursiveTask<V> task)", and "List>" a nested
 * generic) — confirm against the original sources before editing.
 *
 * NOTE(review): a fresh ForkJoinPool is created on every call; frequent
 * callers pay pool setup/teardown each time.
 *
 * @author papapa — parallel computation utility
 */
public class ParallelComputeUtil {

	// Runs the task to completion on a fresh default-parallelism pool;
	// invoke() blocks until the task is done, so shutdown() after it is safe.
	public static V compute(RecursiveTask recursiveTask) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		V value = forkJoinPool.invoke(recursiveTask);
		forkJoinPool.shutdown();
		return value;
	}

	// Runs a batch task on a pool sized at 2x available processors.
	public static List> compute(BatchRecursiveTask recursiveTask) {
		ForkJoinPool forkJoinPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * 2);
		List> value = forkJoinPool.invoke(recursiveTask);
		forkJoinPool.shutdown();
		return value;
	}

	// Fire-and-forget: execute() does not wait for completion; shutdown()
	// still lets the already-submitted action run to its end.
	public static void execute(RecursiveAction recursiveAction) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		forkJoinPool.execute(recursiveAction);
		forkJoinPool.shutdown();
	}

	// Same fire-and-forget semantics for batch actions.
	public static void execute(BatchRecursiveAction recursiveAction) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		forkJoinPool.execute(recursiveAction);
		forkJoinPool.shutdown();
	}
}
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/resources/LICENSE:
--------------------------------------------------------------------------------
1 | 技术无罪,请把你的“技术”应用到实战中!
--------------------------------------------------------------------------------
/xxx-spider/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/xxx-spider/src/test/java/com/zsf/xxx/JunitTest.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.FileOutputStream;
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.net.HttpURLConnection;
10 | import java.net.URL;
11 | import java.util.HashMap;
12 | import java.util.Map;
13 |
14 | import org.apache.commons.io.IOUtils;
15 | import org.apache.commons.lang3.StringUtils;
16 | import org.apache.tika.exception.TikaException;
17 | import org.apache.tika.metadata.Metadata;
18 | import org.apache.tika.parser.AutoDetectParser;
19 | import org.apache.tika.sax.BodyContentHandler;
20 | import org.jsoup.nodes.Document;
21 | import org.jsoup.select.Elements;
22 | import org.junit.Test;
23 | import org.xml.sax.SAXException;
24 |
25 | import com.alibaba.fastjson.JSON;
26 | import com.alibaba.fastjson.JSONPath;
27 | import com.zsf.xxx.http.HttpDebugerViewer;
28 | import com.zsf.xxx.http.HttpViewer;
29 | import com.zsf.xxx.http.JsonpAfeldViewer;
30 | import com.zsf.xxx.util.HttpDownloader;
31 |
32 | import lombok.extern.slf4j.Slf4j;
33 |
34 | /**
35 | * @author papapa
36 | *
37 | */
38 | @Slf4j
39 | public class JunitTest {
40 |
41 | @Test
42 | public void testGetVideoUrl() throws IOException {
43 | String viewUrl = "https://cn.pornhub.com/view_video.php?viewkey=ph5d56b96c279c8";
44 | Document doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
45 | String scriptStr = doc.selectFirst("div#player > script").toString();
46 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}";
47 |
48 | // JSONObject json = JSON.parseObject(scriptStr,JSONObject.class);
49 | String videoUrl = (String) JSONPath.eval(scriptStr,
50 | "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl");// 获取720p且为mp4格式的地址
51 |
52 | System.out.println(videoUrl);
53 | }
54 |
55 | public static void main(String[] args) throws IOException {
56 | //String url = "https://vid3-l3.xvideos-cdn.com/videos/mp4/2/1/3/xvideos.com_213d9cdeb355e88c4ac217af62911445.mp4?e=1568258415&ri=1024&rs=85&h=b6b50269fb96d71fb8577e0ac76ca7bb";
57 | String url = "https://vid1-l3.xvideos-cdn.com/videos/mp4/9/7/0/xvideos.com_9707facedd27ab26bfed74db83328504.mp4?e=1568822783&ri=1024&rs=85&h=42d19ed678f5cf1fa3be0594e63a54d1";
58 | //String url = "https://ardownload2.adobe.com/pub/adobe/reader/mac/AcrobatDC/1901220034/AcroRdrDC_1901220034_MUI.dmg";
59 | String filePath = "/Users/zsf/git/xvidoes/mp4/fontawesome-free-5.9.0-web.zip";
60 | long startTime = System.currentTimeMillis();
61 | new HttpDownloader().download(url, filePath);
62 | log.info("共耗时:{}秒",System.currentTimeMillis() - startTime);
63 | }
64 |
65 | @Test
66 | public void test2() throws IOException{
67 | String url = "https://pcs.baidu.com/rest/2.0/pcs/file?method=download&path=%2F%E7%94%B5%E5%BD%B1%2F%E4%BE%8F%E7%BD%97%E7%BA%AA%E5%85%AC%E5%9B%AD%E4%B8%89%E9%83%A8%E6%9B%B2%2FJurassic.Park.1.1993.mkv&random=0.7139769215592586&app_id=498065";
68 | String filePath = "/Users/zsf/git/xvidoes/mp4/test.mkv";
69 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
70 | connection.setConnectTimeout(6 * 1000);// 超时6秒重连
71 | connection.setReadTimeout(6 * 1000);
72 |
73 | HttpURLConnection.setFollowRedirects(true);
74 | connection.setRequestProperty("range", "bytes=10-");
75 | connection.connect();
76 | // 获取响应吗
77 | int responseCode = connection.getResponseCode();
78 | if (responseCode != 206 && responseCode != 200) {
79 | return;
80 | }
81 | InputStream input = connection.getInputStream();
82 | IOUtils.copy(input, new FileOutputStream(new File(filePath)),1024);
83 | connection.disconnect();
84 | }
85 |
86 | @Test
87 | public void testHttpDebugerViewer() throws IOException{
88 | Document doc = new HttpDebugerViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd");
89 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
90 | log.info(insides.html());
91 | }
92 |
93 | @Test
94 | public void testJsonpAfeldViewer() throws IOException{
95 | Document doc = new JsonpAfeldViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd");
96 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
97 | log.info(insides.html());
98 | }
99 |
100 | @Test
101 | public void testTika() throws FileNotFoundException {
102 | InputStream input = new FileInputStream(new File("/Users/zsf/Desktop/2019年秋学籍模板.xls"));
103 | Map metaData = getMetaData(input);
104 | log.info(JSON.toJSONString(metaData, true));
105 | }
106 |
107 | private Map getMetaData(InputStream input){
108 | Map metadataMap = null;
109 | BodyContentHandler handler = new BodyContentHandler();
110 | Metadata metadata = new Metadata();
111 | AutoDetectParser parser = new AutoDetectParser();
112 | try {
113 | parser.parse(new FileInputStream(new File("/Users/zsf/Desktop/2019年秋学籍模板.xls")), handler, metadata);
114 | metadata.add("downloadUrl", "https://www.baidu.com");
115 | metadataMap = new HashMap<>();
116 | String[] metadataNames = metadata.names();
117 | for(String metaName : metadataNames){
118 | metadataMap.put(metaName,metadata.get(metaName));
119 | }
120 | } catch (IOException | SAXException | TikaException e) {
121 | e.printStackTrace();
122 | }
123 | return metadataMap;
124 | }
125 | }
126 |
--------------------------------------------------------------------------------