├── README.md
└── xxx-spider
├── .gitignore
├── pom.xml
└── src
├── main
├── java
│ └── com
│ │ └── zsf
│ │ └── xxx
│ │ ├── AbstractCrawler.java
│ │ ├── Crawler.java
│ │ ├── Launcher.java
│ │ ├── PornhubCrawler.java
│ │ ├── XvideosCrawler.java
│ │ ├── http
│ │ ├── AbstractHttpViewer.java
│ │ ├── CorsAnywhereViewer.java
│ │ ├── HttpDebugerViewer.java
│ │ ├── HttpViewer.java
│ │ └── JsonpAfeldViewer.java
│ │ └── util
│ │ ├── BatchRecursiveAction.java
│ │ ├── BatchRecursiveTask.java
│ │ ├── DownloadBatchRecursiveTask.java
│ │ ├── FileInfo.java
│ │ ├── FileItemInfo.java
│ │ ├── HttpDownloader.java
│ │ └── ParallelComputeUtil.java
└── resources
│ ├── LICENSE
│ └── log4j2.xml
└── test
└── java
└── com
└── zsf
└── xxx
└── JunitTest.java
/README.md:
--------------------------------------------------------------------------------
1 | # xxx
2 | 不用梯子获取pornhub xvideos的视频地址。获取到地址后可直接用迅雷批量下载视频文件,也可用VLC在线播放视频文件。
3 |
--------------------------------------------------------------------------------
/xxx-spider/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | /.classpath
3 | /.project
4 | /.settings/
5 |
--------------------------------------------------------------------------------
/xxx-spider/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <groupId>com.zsf.xxx</groupId>
5 |   <artifactId>xxx-spider</artifactId>
6 |   <version>1.0.0</version>
7 | 
8 |   <dependencies>
9 |     <dependency>
10 |       <groupId>commons-io</groupId>
11 |       <artifactId>commons-io</artifactId>
12 |       <version>2.5</version>
13 |     </dependency>
14 |     <dependency>
15 |       <groupId>org.apache.commons</groupId>
16 |       <artifactId>commons-lang3</artifactId>
17 |       <version>3.9</version>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>org.apache.commons</groupId>
21 |       <artifactId>commons-text</artifactId>
22 |       <version>1.8</version>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>com.squareup.okhttp3</groupId>
26 |       <artifactId>okhttp</artifactId>
27 |       <version>4.1.0</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.jsoup</groupId>
31 |       <artifactId>jsoup</artifactId>
32 |       <version>1.12.1</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.apache.logging.log4j</groupId>
36 |       <artifactId>log4j-slf4j-impl</artifactId>
37 |       <version>2.12.1</version>
38 |     </dependency>
39 |     <dependency>
40 |       <groupId>com.alibaba</groupId>
41 |       <artifactId>fastjson</artifactId>
42 |       <version>1.2.60</version>
43 |     </dependency>
44 |     <dependency>
45 |       <groupId>junit</groupId>
46 |       <artifactId>junit</artifactId>
47 |       <version>4.12</version>
48 |       <scope>test</scope>
49 |     </dependency>
50 |     <dependency>
51 |       <groupId>org.projectlombok</groupId>
52 |       <artifactId>lombok</artifactId>
53 |       <version>1.18.8</version>
54 |       <scope>provided</scope>
55 |     </dependency>
56 |     <dependency>
57 |       <groupId>commons-codec</groupId>
58 |       <artifactId>commons-codec</artifactId>
59 |       <version>1.13</version>
60 |     </dependency>
61 |     <dependency>
62 |       <groupId>org.apache.tika</groupId>
63 |       <artifactId>tika-parsers</artifactId>
64 |       <version>1.22</version>
65 |     </dependency>
66 |   </dependencies>
67 | 
68 |   <build>
69 |     <plugins>
70 |       <plugin>
71 |         <groupId>org.apache.maven.plugins</groupId>
72 |         <artifactId>maven-assembly-plugin</artifactId>
73 |         <version>3.1.1</version>
74 |         <configuration>
75 |           <archive>
76 |             <manifest>
77 |               <addClasspath>true</addClasspath>
78 |               <mainClass>com.zsf.xxx.Launcher</mainClass>
79 |             </manifest>
80 |             <compress>true</compress>
81 |           </archive>
82 |           <descriptorRefs>
83 |             <descriptorRef>jar-with-dependencies</descriptorRef>
84 |           </descriptorRefs>
85 |           <encoding>UTF-8</encoding>
86 |         </configuration>
87 |         <executions>
88 |           <execution>
89 |             <id>assemble-all</id>
90 |             <phase>package</phase>
91 |             <goals>
92 |               <goal>single</goal>
93 |             </goals>
94 |           </execution>
95 |         </executions>
96 |       </plugin>
97 |     </plugins>
98 |   </build>
99 | </project>
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/AbstractCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.commons.io.FileUtils;
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | /**
15 | * @author papapa
16 | *
17 | */
18 | public abstract class AbstractCrawler implements Crawler{
19 |
20 | private static final Logger log = LoggerFactory.getLogger(AbstractCrawler.class);
21 |
22 | @Override
23 | public void execute(String dir) throws IOException {
24 | if (StringUtils.isBlank(dir)) {
25 | throw new IOException("文件保存目录为空");
26 | }
27 | Map categories = getCategories();
28 | if (categories != null) {
29 | for (Map.Entry entry : categories.entrySet()) {
30 | String title = entry.getKey();
31 | String href = entry.getValue();
32 | List viewUrls = getViewUrls(href);
33 | log.info("获取所有[{}]播放地址成功:{}", title, viewUrls);
34 | List videoUrls = getVideoUrls(viewUrls);
35 | log.info("获取所有[{}]视频地址成功:{}", title, videoUrls);
36 |
37 | String downloadFileName = dir + File.separatorChar + title + File.separatorChar + "download" + ".txt";
38 | FileUtils.writeLines(new File(downloadFileName),"UTF-8", videoUrls, false);
39 | log.info("[{}]视频链接保存在{}",title,downloadFileName);
40 |
41 | log.info("开始下载[{}]下的视频",title);
42 | downloadVideos(dir + File.separatorChar + title, videoUrls);
43 | log.info("结束下载[{}]下的视频",title);
44 | }
45 | }
46 | }
47 |
48 | public void downloadVideos(String dir,List videoUrls){
49 | for(String videoUrl : videoUrls){
50 | downloadVideo(dir,videoUrl);
51 | }
52 | }
53 |
54 | public List getVideoUrls(List viewUrls) {
55 | List videoUrls = new ArrayList<>();
56 | if (viewUrls != null && viewUrls.size() > 0) {
57 | for (String viewUrl : viewUrls) {
58 | String videoUrl = getVideoUrl(viewUrl);
59 | if(StringUtils.isNotBlank(videoUrl)){
60 | videoUrls.add(videoUrl);
61 | }
62 | }
63 | }
64 | return videoUrls;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/Crawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | /**
8 | *
9 | * @author papapa
10 | *
11 | */
12 | public interface Crawler {
13 |
14 | /**
15 | * 爬虫入口
16 | *
17 | * @param dir
18 | * 文件保存目录
19 | * @throws IOException
20 | */
21 | void execute(String dir) throws IOException;
22 |
23 | /**
24 | * 获取分类集合
25 | *
26 | * @return
27 | */
28 | Map getCategories();
29 |
30 | /**
31 | * 获取视频地址
32 | *
33 | * @param href
34 | * @return
35 | */
36 | List getViewUrls(String href);
37 |
38 | /**
39 | * 获取视频高清地址
40 | *
41 | * @param viewUrl
42 | * @return
43 | * @throws IOException
44 | */
45 | String getVideoUrl(String viewUrl);
46 |
47 | /**
48 | * 下载视频
49 | * @param dir 文件保存目录
50 | * @param videoUrl 视频地址
51 | * @return
52 | */
53 | boolean downloadVideo(String dir,String videoUrl);
54 | }
55 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/Launcher.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.nio.charset.Charset;
6 | import java.util.List;
7 |
8 | import org.apache.commons.io.FileUtils;
9 | import org.apache.commons.io.IOUtils;
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | /**
15 | * @author papapa
16 | *
17 | */
18 | public class Launcher {
19 |
20 | private static final Logger log = LoggerFactory.getLogger(Launcher.class);
21 |
22 | public static void main(String[] args) throws IOException {
23 | InputStream in = Launcher.class.getResourceAsStream("/LICENSE");
24 | if(in != null){
25 | List lines = IOUtils.readLines(in,Charset.forName("UTF-8"));
26 | log.info(StringUtils.join(lines,"\r\n"));
27 | }
28 | log.info("爬虫开始");
29 | String fileDir = FileUtils.getTempDirectoryPath();//默认文件保存目录
30 | if(args != null && args.length > 0){
31 | fileDir = args[0];
32 | }
33 | log.info("视频地址文件保存在:{}",fileDir);
34 | //new PornhubCrawler().execute(fileDir);
35 | new XvideosCrawler().execute(fileDir);
36 | log.info("爬虫停止");
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/PornhubCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.commons.lang3.StringUtils;
10 | import org.jsoup.nodes.Document;
11 | import org.jsoup.nodes.Element;
12 | import org.jsoup.select.Elements;
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | import com.alibaba.fastjson.JSONPath;
17 | import com.zsf.xxx.http.HttpViewer;
18 |
19 | /**
20 | * @author papapa
21 | *
22 | */
23 | public class PornhubCrawler extends AbstractCrawler {
24 |
25 | private static final Logger log = LoggerFactory.getLogger(PornhubCrawler.class);
26 |
27 | private static final String BASE_URL = "https://cn.pornhub.com";// 中文pornhub
28 |
29 | private static final String PARAMS = "hd=1";// 高清
30 |
31 | /**
32 | * 下载视频
33 | *
34 | * @param dir
35 | * @param videoUrl
36 | */
37 | @Override
38 | public boolean downloadVideo(String dir, String videoUrl) {
39 | return false;
40 | }
41 |
42 | /**
43 | * 获取视频地址(720p mp4格式)
44 | *
45 | * @param viewUrl
46 | * @return
47 | * @throws IOException
48 | */
49 | @Override
50 | public String getVideoUrl(String viewUrl) {
51 | String videoUrl = null;
52 | Document doc;
53 | try {
54 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
55 | if (doc == null) {
56 | return null;
57 | }
58 | String scriptStr = doc.selectFirst("div#player > script").toString();
59 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}";
60 |
61 | // 获取720p且为mp4格式的地址
62 | videoUrl = (String) JSONPath.eval(scriptStr, "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl");
63 | } catch (IOException e) {
64 | log.error("获取视频地址错误:[" + videoUrl + "]", e);
65 | }
66 |
67 | return videoUrl;
68 | }
69 |
70 | /**
71 | * 获取该分类地址下的所有播放地址
72 | *
73 | * @param href
74 | * @return
75 | * @throws IOException
76 | */
77 | @Override
78 | public List getViewUrls(String href) {
79 | List viewUrls = new ArrayList<>();
80 | Document doc;
81 | try {
82 | doc = HttpViewer.getRandomInstance().getResponseDoc(href);
83 | if (doc == null) {
84 | return null;
85 | }
86 | List pageUrls = getPageUrls(doc);
87 | if (pageUrls != null) {
88 | viewUrls.addAll(pageUrls);
89 | }
90 |
91 | Element nextPageElement = doc.selectFirst("div.pagination3 > ul > li.page_next > a");
92 | if (nextPageElement != null) {
93 | String nextPageHref = BASE_URL + "/" + nextPageElement.attr("href");// 下一页地址
94 | if (nextPageHref.endsWith("page=2")) {// 为了测试只获取1页数据
95 | return viewUrls;
96 | }
97 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址
98 | }
99 | } catch (IOException e) {
100 | log.error("获取文档对象错误:[" + href + "]", e);
101 | }
102 |
103 | return viewUrls;
104 | }
105 |
106 | /**
107 | * 获取当前页中每个的播放地址
108 | *
109 | * @param doc
110 | * @return
111 | */
112 | public List getPageUrls(Document doc) {
113 | List urls = null;
114 | Elements lis = doc.select("ul#videoCategory > li.js-pop");
115 | if (lis != null && lis.size() > 0) {
116 | urls = new ArrayList<>();
117 | for (Element li : lis) {
118 | urls.add(BASE_URL + "/view_video.php?viewkey=" + li.attr("_vkey"));
119 | }
120 | }
121 | return urls;
122 | }
123 |
124 | /**
125 | * 获取分类集合
126 | *
127 | * @return
128 | * @throws IOException
129 | */
130 | @Override
131 | public Map getCategories() {
132 | Map result = null;
133 | Document doc;
134 | try {
135 | doc = HttpViewer.getRandomInstance().getResponseDoc(BASE_URL + "/categories");
136 | if (doc == null) {
137 | return null;
138 | }
139 | result = new HashMap<>();
140 | Elements lis = doc.select("ul#categoriesListSection > li");
141 | for (Element li : lis) {
142 | String href = BASE_URL + li.selectFirst("div.category-wrapper > a").attr("href");
143 | if (href.contains("?")) {// 只获取高清视频
144 | href += "&" + PARAMS;
145 | } else {
146 | href += "?" + PARAMS;
147 | }
148 | String title = li.selectFirst("div.category-wrapper > h5 > a").attr("data-mxptext");
149 | result.put(title, href);
150 | log.info("分类:{},地址:{}", title, href);
151 | if (result.size() > 2)
152 | break;// 只获取3个
153 | }
154 | } catch (IOException e) {
155 | log.error("获取分类错误:", e);
156 | }
157 | return result;
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/XvideosCrawler.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.jsoup.nodes.Document;
12 | import org.jsoup.nodes.Element;
13 | import org.jsoup.select.Elements;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import com.zsf.xxx.http.HttpViewer;
18 | import com.zsf.xxx.util.HttpDownloader;
19 |
20 | /**
21 | * @author papapa
22 | *
23 | */
24 | public class XvideosCrawler extends AbstractCrawler{
25 |
26 | private static final Logger log = LoggerFactory.getLogger(XvideosCrawler.class);
27 |
28 | private static final String BASE_URL = "https://www.xvideos.com";
29 |
30 | /**
31 | * 获取视频高清地址
32 | *
33 | * @param viewUrl
34 | * @return
35 | * @throws IOException
36 | */
37 | @Override
38 | public String getVideoUrl(String viewUrl) {
39 | String videoUrl = null;
40 | Document doc;
41 | try {
42 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
43 | if (doc == null) {
44 | return null;
45 | }
46 | Elements scripts = doc.getElementsByTag("script");
47 | if (scripts == null || scripts.size() == 0) {
48 | return null;
49 | }
50 | for (Element script : scripts) {
51 | String scriptStr = script.html();
52 | if (StringUtils.contains(scriptStr, "html5player.setVideoUrlHigh('")) {
53 | videoUrl = StringUtils.substringBetween(scriptStr, "html5player.setVideoUrlHigh('", "');");
54 | break;
55 | }
56 | }
57 | } catch (IOException e) {
58 | log.error("获取视频地址错误:[" + viewUrl + "]", e);
59 | }
60 |
61 | return videoUrl;
62 | }
63 |
64 | /**
65 | * 获取该分类地址下的所有播放地址
66 | *
67 | * @param href
68 | * @return
69 | * @throws IOException
70 | */
71 | @Override
72 | public List getViewUrls(String href) {
73 | List viewUrls = new ArrayList<>();
74 | Document doc;
75 | try {
76 | doc = HttpViewer.getRandomInstance().getResponseDoc(href);
77 | if (doc == null) {
78 | return null;
79 | }
80 | List pageUrls = getPageUrls(doc);
81 | if (pageUrls != null) {
82 | viewUrls.addAll(pageUrls);
83 | }
84 |
85 | Element nextPageElement = doc.selectFirst("div.pagination > ul > li > a.next-page");
86 | if (nextPageElement != null) {
87 | String nextPageHref = BASE_URL + nextPageElement.attr("href");// 下一页地址
88 | if (nextPageHref.contains("/3") || nextPageHref.contains("p=3")) {// 为了测试只获取2页数据
89 | return viewUrls;
90 | }
91 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址
92 | }
93 | } catch (IOException e) {
94 | log.error("获取文档对象错误:[" + href + "]", e);
95 | }
96 |
97 | return viewUrls;
98 | }
99 |
100 | /**
101 | * 获取当前页中每个的播放地址
102 | *
103 | * @param doc
104 | * @return
105 | */
106 | public List getPageUrls(Document doc) {
107 | //Elements links = doc.select("div#content > div.mozaique > div div.thumb > a");
108 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
109 | if(insides == null || insides.size() == 0){
110 | return null;
111 | }
112 | List urls = new ArrayList<>();
113 | for(Element inside : insides){
114 | Element hdElement = inside.selectFirst("span.video-hd-mark");//获取高清(720p)视频
115 | if(hdElement != null){
116 | Element linkElement = inside.selectFirst("div.thumb > a");
117 | String link = linkElement.attr("href");
118 | urls.add(BASE_URL + link);
119 | }
120 | }
121 | return urls;
122 | }
123 |
124 | @Override
125 | public Map getCategories() {
126 | Map categories = new HashMap<>();
127 | categories.put("丝袜",BASE_URL+"/c/Stockings-28");
128 | categories.put("喷水",BASE_URL+"/c/Squirting-56");
129 | categories.put("女同",BASE_URL+"/?k=lesbian&quality=hd");
130 |
131 | return categories;
132 | }
133 |
134 | @Override
135 | public boolean downloadVideo(String dir, String videoUrl) {
136 | String fileName = StringUtils.substringAfterLast(videoUrl,"/");
137 | fileName = StringUtils.substringBefore(fileName, "?");
138 | String filePath = dir+File.separatorChar+fileName;
139 | try {
140 | new HttpDownloader().download(videoUrl, filePath);
141 | return true;
142 | } catch (IOException e) {
143 | e.printStackTrace();
144 | }
145 | return false;
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/AbstractHttpViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 | import java.time.Duration;
5 |
6 | import org.jsoup.nodes.Document;
7 |
8 | import okhttp3.OkHttpClient;
9 |
10 | /**
11 | * @author papapa
12 | *
13 | */
14 | public abstract class AbstractHttpViewer implements HttpViewer {
15 |
16 | public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36";
17 |
18 | @Override
19 | public String getResponesStr(String url) throws IOException {
20 | Document doc = getResponseDoc(url);
21 | if(doc != null){
22 | return doc.html();
23 | }
24 | return null;
25 | }
26 |
27 | public OkHttpClient.Builder getOkHttpClientBuilder(){
28 | OkHttpClient.Builder builder = new OkHttpClient.Builder()
29 | .followRedirects(true)
30 | .followSslRedirects(true)
31 | .retryOnConnectionFailure(true)
32 | .readTimeout(Duration.ofSeconds(10L))
33 | .connectTimeout(Duration.ofSeconds(10L))
34 | .readTimeout(Duration.ofSeconds(10L))
35 | .callTimeout(Duration.ofSeconds(10L));
36 | return builder;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/CorsAnywhereViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import okhttp3.Call;
11 | import okhttp3.OkHttpClient;
12 | import okhttp3.Request;
13 | import okhttp3.ResponseBody;
14 |
15 | /**
16 | * @author papapa
17 | *
18 | */
19 | public class CorsAnywhereViewer extends AbstractHttpViewer {
20 |
21 | private static final Logger log = LoggerFactory.getLogger(CorsAnywhereViewer.class);
22 |
23 | private static final String PROXY_URL = "https://cors-anywhere.herokuapp.com/";
24 |
25 | @Override
26 | public Document getResponseDoc(String url) throws IOException {
27 | log.info("获取页面:[{}]内容", url);
28 | OkHttpClient client = super.getOkHttpClientBuilder().build();
29 | Request request = new Request.Builder().url(PROXY_URL + url).header("x-requested-with", "XMLHttpRequest")
30 | .header("User-Agent", USER_AGENT).get().build();
31 | Call call = client.newCall(request);
32 | okhttp3.Response response = call.execute();
33 | int code = response.code();
34 | if (code == 200) {
35 | ResponseBody body = response.body();
36 | Document doc = Jsoup.parse(body.string());
37 | body.close();
38 | response.close();
39 | return doc;
40 | }
41 | return null;
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/HttpDebugerViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import org.apache.commons.text.StringEscapeUtils;
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import okhttp3.Call;
14 | import okhttp3.FormBody;
15 | import okhttp3.Headers;
16 | import okhttp3.OkHttpClient;
17 | import okhttp3.Request;
18 | import okhttp3.RequestBody;
19 | import okhttp3.ResponseBody;
20 |
21 | /**
22 | * @author papapa
23 | *
24 | */
25 | public class HttpDebugerViewer extends AbstractHttpViewer {
26 |
27 | private static final Logger log = LoggerFactory.getLogger(HttpDebugerViewer.class);
28 |
29 | private static final String PROXY_URL = "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx";
30 |
31 | @Override
32 | public Document getResponseDoc(String url) throws IOException {
33 | log.info("获取页面:[{}]内容", url);
34 | OkHttpClient client = super.getOkHttpClientBuilder().build();
35 | Request request = new Request.Builder().url(PROXY_URL).headers(Headers.of(getHeaders())).post(getRequestBody(url)).build();
36 | Call call = client.newCall(request);
37 | okhttp3.Response response = call.execute();
38 | int code = response.code();
39 | if (code == 200) {
40 | ResponseBody body = response.body();
41 | Document doc = Jsoup.parse(body.string());
42 | body.close();
43 | response.close();
44 | String html = doc.selectFirst("div#ResultData pre").html();
45 | html = StringEscapeUtils.unescapeHtml4(html);
46 | return Jsoup.parse(html);
47 | }
48 | return null;
49 | }
50 |
51 | private RequestBody getRequestBody(String url){
52 | FormBody.Builder builder = new FormBody.Builder();
53 | builder.add("UrlBox", url);
54 | builder.add("AgentList", "Google Chrome");
55 | builder.add("VersionsList", "HTTP/1.1");
56 | builder.add("MethodList", "GET");
57 | return builder.build();
58 | }
59 |
60 | private Map getHeaders(){
61 | Map headers = new HashMap<>();
62 | headers.put("Host", "www.httpdebugger.com");
63 | headers.put("Origin", "http://www.httpdebugger.com");
64 | headers.put("Pragma", "no-cache");
65 | headers.put("Referer", "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx");
66 | headers.put("Upgrade-Insecure-Requests", "1");
67 | return headers;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/HttpViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.lang3.RandomUtils;
6 | import org.jsoup.nodes.Document;
7 |
8 | /**
9 | * @author papapa
10 | *
11 | */
12 | public interface HttpViewer {
13 |
14 | /**
15 | * 获取url响应后的html内容
16 | * @param url
17 | * @return
18 | * @throws IOException
19 | */
20 | String getResponesStr(String url) throws IOException;
21 |
22 | /**
23 | * 获取url响应后的文档对像
24 | * @param url
25 | * @return
26 | * @throws IOException
27 | */
28 | Document getResponseDoc(String url) throws IOException;
29 |
30 | /**
31 | * 获取httpViewer的一个随机实例
32 | * @return
33 | */
34 | public static HttpViewer getRandomInstance(){
35 | HttpViewer[] VIEWERS = new HttpViewer[]{new CorsAnywhereViewer(),new HttpDebugerViewer(),new JsonpAfeldViewer()};
36 | int index = RandomUtils.nextInt(0, 2);
37 | return VIEWERS[index];
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/http/JsonpAfeldViewer.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.http;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import okhttp3.Call;
11 | import okhttp3.OkHttpClient;
12 | import okhttp3.Request;
13 | import okhttp3.ResponseBody;
14 |
15 | /**
16 | * @author papapa
17 | *
18 | */
19 | public class JsonpAfeldViewer extends AbstractHttpViewer {
20 |
21 | private static final Logger log = LoggerFactory.getLogger(JsonpAfeldViewer.class);
22 |
23 | private static final String PROXY_URL = "https://jsonp.afeld.me/?url=";
24 |
25 | @Override
26 | public Document getResponseDoc(String url) throws IOException {
27 | log.info("获取页面:[{}]内容", url);
28 | OkHttpClient client = super.getOkHttpClientBuilder().build();
29 | Request request = new Request.Builder().url(PROXY_URL + url).header("User-Agent", USER_AGENT).get().build();
30 | Call call = client.newCall(request);
31 | okhttp3.Response response = call.execute();
32 | int code = response.code();
33 | if (code == 200) {
34 | ResponseBody body = response.body();
35 | Document doc = Jsoup.parse(body.string());
36 | body.close();
37 | response.close();
38 | return doc;
39 | }
40 | return null;
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveAction.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.lang.reflect.Constructor;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.concurrent.RecursiveAction;
8 |
9 | /**
10 | * @author papapa
11 | *
12 | */
13 | @SuppressWarnings("rawtypes")
14 | public abstract class BatchRecursiveAction extends RecursiveAction{
15 |
16 | private static final long serialVersionUID = -2909644333830555865L;
17 |
18 | private List items;
19 |
20 | private Object ext;
21 |
22 | protected BatchRecursiveAction(List items,Object ext){
23 | this.items = items;
24 | this.ext = ext;
25 | }
26 |
27 | @SuppressWarnings({ "unchecked" })
28 | @Override
29 | protected void compute() {
30 | if(items != null && items.size() > 0){
31 | if(items.size() == 1){
32 | computeItem(items.get(0));
33 | }else{
34 | List actions = new ArrayList<>();
35 | for(Object item : items){
36 | List subActions = new ArrayList<>();
37 | subActions.add(item);
38 | BatchRecursiveAction subBatchRecursiveAction = null;
39 | try {
40 | Constructor constructor = this.getClass().getDeclaredConstructors()[0];
41 | constructor.setAccessible(true);
42 | subBatchRecursiveAction = (BatchRecursiveAction) constructor.newInstance(subActions,this.ext);
43 | actions.add(subBatchRecursiveAction);
44 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException
45 | | InvocationTargetException | SecurityException e) {
46 | e.printStackTrace();
47 | }
48 | }
49 | invokeAll(actions);
50 | for(BatchRecursiveAction action : actions){
51 | action.join();
52 | }
53 | }
54 | }
55 | }
56 |
57 | public abstract void computeItem(Object item);
58 | }
59 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveTask.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.lang.reflect.Constructor;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.lang.reflect.Type;
6 | import java.util.ArrayList;
7 | import java.util.Arrays;
8 | import java.util.List;
9 | import java.util.concurrent.RecursiveTask;
10 |
11 | /**
12 | * @author papapa
13 | *
14 | */
15 | @SuppressWarnings("rawtypes")
16 | public abstract class BatchRecursiveTask extends RecursiveTask{
17 |
18 | private static final long serialVersionUID = 2119394771132854398L;
19 |
20 | private List items;
21 |
22 | private Object ext;
23 |
24 | protected BatchRecursiveTask(List items,Object ext){
25 | this.items = items;
26 | this.ext = ext;
27 | }
28 |
29 | public Object getExt(){
30 | return this.ext;
31 | }
32 |
33 | @SuppressWarnings({ "unchecked" })
34 | @Override
35 | protected List compute() {
36 | List values = new ArrayList<>();
37 | if(items != null && items.size() > 0){
38 | if(items.size() == 1){
39 | values.add(computeItem(items.get(0)));
40 | }else{
41 | BatchRecursiveTask[] tasks = new BatchRecursiveTask[items.size()];
42 | int index = 0;
43 | Constructor constructor = this.getClass().getDeclaredConstructors()[0];
44 | constructor.setAccessible(true);
45 | Type[] types = constructor.getGenericParameterTypes();//匿名内部内有多个参数
46 | Object[] initargs = new Object[types.length];
47 | for(Object item : items){
48 | try {
49 | initargs[types.length-1] = this.ext;
50 | initargs[types.length-2] = Arrays.asList(item);
51 | tasks[index] = (BatchRecursiveTask) constructor.newInstance(initargs);
52 | index++;
53 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException
54 | | InvocationTargetException | SecurityException e) {
55 | e.printStackTrace();
56 | }
57 | }
58 | invokeAll(tasks);
59 | for(BatchRecursiveTask task : tasks){
60 | values.addAll(task.join());
61 | }
62 | }
63 | }
64 | return values;
65 | }
66 | public abstract Object computeItem(Object item);
67 | }
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/DownloadBatchRecursiveTask.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.InputStream;
6 | import java.io.OutputStream;
7 | import java.net.HttpURLConnection;
8 | import java.net.SocketTimeoutException;
9 | import java.net.URL;
10 | import java.util.List;
11 | import java.util.concurrent.TimeUnit;
12 |
13 | import javax.net.ssl.SSLException;
14 |
15 | import org.apache.commons.io.FileUtils;
16 | import org.apache.commons.io.IOUtils;
17 |
18 | import lombok.extern.slf4j.Slf4j;
19 |
20 | /**
21 | * @author papapa
22 | *
23 | */
24 | @Slf4j
25 | public class DownloadBatchRecursiveTask extends BatchRecursiveTask {
26 |
27 | private static final long serialVersionUID = -6577872267278165509L;
28 |
29 | private static final int RETRY_TIMES = 10;// 重试次数
30 |
31 | private static final int CONNECTION_TIMEOUT = 10 * 1000;//连接超时时间10秒
32 |
33 | private static final int READ_TIMEOUT = 10 * 1000;//读超时时间10秒
34 |
35 | protected DownloadBatchRecursiveTask(List> items, Object ext) {
36 | super(items, ext);
37 | }
38 |
39 | @Override
40 | public Object computeItem(Object item) {
41 | FileItemInfo itemInfo = (FileItemInfo) item;
42 | String filePath = itemInfo.getFilePath();
43 | File file = new File(filePath);
44 | long downloadLength = getDownloadLength(itemInfo);
45 | if (file.exists()) {
46 | if (file.length() == downloadLength) {
47 | log.info("分片[{}]已下载成功",itemInfo.getPartIndex());
48 | itemInfo.setDownloadSuccess(true);
49 | return itemInfo;
50 | }
51 | }
52 | downloadPart(itemInfo, RETRY_TIMES);
53 | return itemInfo;
54 | }
55 |
56 | private FileItemInfo downloadPart(FileItemInfo itemInfo, int retryTimes) {
57 | String filePath = itemInfo.getFilePath();
58 | int partIndex = itemInfo.getPartIndex();
59 |
60 | String url = itemInfo.getUrl();
61 | String range = getRange(itemInfo);
62 | InputStream input = null;
63 | OutputStream output = null;
64 | if(range == null){
65 | itemInfo.setDownloadSuccess(true);
66 | log.info("分片[{}]下存在,无需下载",itemInfo.getPartIndex());
67 | return itemInfo;
68 | }
69 | if (retryTimes == 0) {
70 | log.error("分片[{}]重试5次后依旧下载失败",partIndex);
71 | return null;
72 | }
73 | log.info("分片[{}],range:{},文件:{}",partIndex, range,filePath);
74 | HttpURLConnection connection = null;
75 | try {
76 | connection = (HttpURLConnection) new URL(url).openConnection();
77 | connection.setConnectTimeout(CONNECTION_TIMEOUT);
78 | connection.setReadTimeout(READ_TIMEOUT);
79 |
80 | HttpURLConnection.setFollowRedirects(true);
81 | connection.setRequestProperty("range", "bytes=" + range);
82 | connection.connect();
83 | // 获取响应吗
84 | int responseCode = connection.getResponseCode();
85 | if (responseCode != 206 && responseCode != 200) {
86 | return null;
87 | }
88 | input = connection.getInputStream();
89 | output = new FileOutputStream(filePath, true);
90 | IOUtils.copy(input, output, 1024);
91 | output.flush();
92 | connection.disconnect();
93 | itemInfo.setDownloadSuccess(true);
94 | log.info("分片[{}]下载成功",partIndex);
95 | } catch (Exception e) {
96 | if(!(e instanceof SocketTimeoutException) && !(e instanceof SSLException)){
97 | log.error("分片["+partIndex+"]下载失败:",e);
98 | }
99 | retryTimes--;
100 | log.error("分片[{}]开始重试第[{}]次",partIndex,RETRY_TIMES - retryTimes);
101 | try {
102 | TimeUnit.SECONDS.sleep(1L);
103 | } catch (InterruptedException e1) {
104 | e1.printStackTrace();
105 | }
106 | if(connection != null){
107 | connection.disconnect();
108 | }
109 | IOUtils.closeQuietly(input);
110 | IOUtils.closeQuietly(output);
111 |
112 | downloadPart(itemInfo, retryTimes);
113 | }
114 | return itemInfo;
115 | }
116 |
117 | /**
118 | * 要下载的长度
119 | * @param itemInfo
120 | * @return
121 | */
122 | private long getDownloadLength(FileItemInfo itemInfo){
123 | long downloadLength = itemInfo.getEndIndex() - itemInfo.getStartIndex() + (itemInfo.isLastSharding() ? 0 : 1);
124 | return downloadLength;
125 | }
126 |
127 | /**
128 | * 获取range
129 | * @param itemInfo
130 | * @return 当返回null时表示不需要再重新下载
131 | */
132 | private String getRange(FileItemInfo itemInfo){
133 | String filePath = itemInfo.getFilePath();
134 | File tempFile = new File(filePath);
135 | FileInfo fileInfo = itemInfo.getFileInfo();
136 | long contentLength = fileInfo.getContentLength();
137 |
138 | long tempContentLength = tempFile.exists() ? FileUtils.sizeOf(tempFile) : 0;
139 | log.info("临时文件[{}]大小{}Bytes",tempFile.getAbsolutePath(),tempContentLength);
140 | String range = itemInfo.getStartIndex() + "-";
141 | if(tempContentLength == 0){
142 | if (itemInfo.getEndIndex() != contentLength) {
143 | range += itemInfo.getEndIndex();
144 | }
145 | }else{
146 | if (getDownloadLength(itemInfo) == tempContentLength) {
147 | range = null;
148 | }else{
149 | range = itemInfo.getStartIndex() + tempContentLength + "-"+ itemInfo.getEndIndex();
150 | }
151 | }
152 | return range;
153 | }
154 | }
155 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/FileInfo.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import lombok.Getter;
4 | import lombok.Setter;
5 | import lombok.experimental.Accessors;
6 |
/**
 * Metadata about a remote file, populated from the probe request's HTTP
 * response headers before the actual download starts.
 *
 * @author papapa
 *
 */
@Getter
@Setter
@Accessors(chain = true)
public class FileInfo {

    private String url;// original download url of the file

    private String filePath;// local path the file is saved to

    private long contentLength;// Content-Length in bytes

    private String contentType;// Content-Type response header

    private long lastModified;// Last-Modified timestamp (epoch millis, from URLConnection.getLastModified)

    private boolean suportSharding;// whether the server supports ranged (sharded) download; misspelling kept — the lombok getter is public API

}
29 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/FileItemInfo.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import lombok.Getter;
4 | import lombok.Setter;
5 | import lombok.experimental.Accessors;
6 |
7 | /**
8 | * @author papapa
9 | *
10 | */
11 | @Getter
12 | @Setter
13 | @Accessors(chain = true)
14 | public class FileItemInfo implements Comparable {
15 |
16 | private FileInfo fileInfo;
17 |
18 | private String url;// 下载地址
19 |
20 | private String filePath;// 文件名
21 |
22 | private int partIndex;// 分片位置
23 |
24 | private long startIndex;// 文件开始下载位置
25 |
26 | private long endIndex;// 文件结束位置
27 |
28 | private boolean downloadSuccess;//是否下载成功
29 |
30 | private boolean firstSharding;//是否首片
31 |
32 | private boolean lastSharding;//是否最后一片
33 |
34 | @Override
35 | public int compareTo(FileItemInfo o) {
36 | int partIndex1 = this.getPartIndex();
37 | int partIndex2 = o.getPartIndex();
38 | if (partIndex1 < partIndex2) {
39 | return -1;
40 | } else if (partIndex1 > partIndex2) {
41 | return 1;
42 | } else {
43 | return 0;
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/HttpDownloader.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;

import lombok.extern.slf4j.Slf4j;
18 |
19 | /**
20 | * @author papapa
21 | *
22 | */
23 | @Slf4j
24 | public class HttpDownloader {
25 |
26 | private static final String TMP_PATH = FileUtils.getTempDirectoryPath();// 临时文件保存目录
27 |
28 | private static final int CONNECTION_TIMEOUT = 10 * 1000;//连接超时时间10秒
29 |
30 | private static final int READ_TIMEOUT = 10 * 1000;//读超时时间10秒
31 |
32 |
33 | public void download(String url, String filePath) throws IOException {
34 | FileInfo fileInfo = getFileInfo(url);
35 | if (fileInfo == null) {
36 | throw new IOException("获取文件信息失败:["+url+"]");
37 | }
38 | fileInfo.setFilePath(filePath);
39 | log.info("文件信息:{}",ToStringBuilder.reflectionToString(fileInfo, ToStringStyle.JSON_STYLE));
40 |
41 | File savedFile = new File(filePath);
42 | if (savedFile.exists()) {
43 | if (FileUtils.sizeOf(savedFile) == fileInfo.getContentLength()) {
44 | log.info("文件已下载完成:[{}]", filePath);
45 | return;
46 | }
47 | FileUtils.forceDelete(savedFile);
48 | }
49 | savedFile.createNewFile();
50 | if(fileInfo.isSuportSharding()){//支持分片下载
51 | long contentLength = fileInfo.getContentLength();
52 | int threadNum = Runtime.getRuntime().availableProcessors() * 2;
53 | if(contentLength <= threadNum){
54 | threadNum = 1;
55 | }
56 | multiThreadsDownload(fileInfo, threadNum);
57 | }else{
58 | singleDownload(fileInfo);
59 | }
60 | }
61 |
62 | /**
63 | * 单线程下载
64 | * @param fileInfo
65 | */
66 | private void singleDownload(FileInfo fileInfo) {
67 |
68 | }
69 |
70 | /**
71 | * 开始下载文件
72 | * @param fileInfo
73 | * @param threadNum
74 | * @throws IOException
75 | */
76 | private void multiThreadsDownload(FileInfo fileInfo, int threadNum) throws IOException {
77 | long contentLength = fileInfo.getContentLength();
78 | if(contentLength <= threadNum){//内容长度小于等于线程数时,只需要一个线程下载
79 | threadNum = 1;
80 | }
81 | List fileItemInfos = new ArrayList<>();
82 |
83 | long partSize = contentLength / threadNum + 1;
84 |
85 | log.info("每片大小[{}]Bytes",partSize);
86 | for (int i = 0; i < threadNum; i++) {
87 | long startIndex = i * partSize;
88 | startIndex = i == 0 ? 0 : fileItemInfos.get(i-1).getEndIndex()+1L;
89 |
90 | long endIndex = startIndex + partSize -1 ;
91 |
92 | if (i == threadNum - 1) {
93 | endIndex = fileInfo.getContentLength();
94 | }
95 | log.info("分片[{}]区间:{}-{},大小:{}",i+1,startIndex,endIndex,endIndex-startIndex+1);
96 | FileItemInfo fileItemInfo = new FileItemInfo();
97 | fileItemInfo.setPartIndex(i+1);
98 | fileItemInfo.setUrl(fileInfo.getUrl());
99 | fileItemInfo.setStartIndex(startIndex);
100 | fileItemInfo.setFilePath(getTmpFilePath(fileInfo, i+1));
101 | fileItemInfo.setEndIndex(endIndex);
102 | fileItemInfo.setFileInfo(fileInfo);
103 | fileItemInfo.setFirstSharding(i==0);
104 | fileItemInfo.setLastSharding(i == threadNum - 1);
105 | fileItemInfos.add(fileItemInfo);
106 | }
107 |
108 | @SuppressWarnings("unchecked")
109 | List results = (List) ParallelComputeUtil.compute(new DownloadBatchRecursiveTask(fileItemInfos, null));
110 | mergeFile(results);
111 |
112 | }
113 |
114 | /**
115 | * 合并文件
116 | * @param results
117 | * @throws IOException
118 | */
119 | private void mergeFile(List results) throws IOException {
120 | if(results == null){
121 | log.info("下载失败");
122 | return;
123 | }
124 | Collections.sort(results);
125 | for(FileItemInfo itemInfo : results){
126 | if(!itemInfo.isDownloadSuccess()){
127 | log.error("分片[{}]下载失败,取消合并文件",itemInfo.getPartIndex());
128 | return;
129 | }
130 | }
131 | addPartIntoFile(results);
132 | String filePath = results.get(0).getFileInfo().getFilePath();
133 | log.info("文件[{}]下载完成,大小:{}Bytes",filePath,FileUtils.sizeOf(new File(filePath)));
134 | deleteTempFile(results);
135 | }
136 |
137 | /**
138 | * 删除临时文件
139 | * @param items
140 | */
141 | private void deleteTempFile(List items){
142 | for(FileItemInfo item : items){
143 | boolean flag = FileUtils.deleteQuietly(new File(item.getFilePath()));
144 | log.info("临时文件[{}]删除{}",item.getFilePath(),flag ? "成功" : "失败");
145 | }
146 | }
147 | /**
148 | * 将分片下载的文件写入主文件
149 | * @param itemInfo
150 | * @throws IOException
151 | */
152 | private void addPartIntoFile(List itemInfos) throws IOException{
153 | String filePath = itemInfos.get(0).getFileInfo().getFilePath();
154 | RandomAccessFile file = new RandomAccessFile(new File(filePath), "rw");
155 | for(FileItemInfo itemInfo : itemInfos){
156 | file.seek(itemInfo.getStartIndex());
157 | File tempFile = new File(itemInfo.getFilePath());
158 | file.write(FileUtils.readFileToByteArray(tempFile));
159 | }
160 | file.close();
161 | }
162 |
163 | /**
164 | * 获取文件信息
165 | * @param url
166 | * @return
167 | * @throws IOException
168 | */
169 | private FileInfo getFileInfo(String url) throws IOException {
170 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
171 | connection.setConnectTimeout(CONNECTION_TIMEOUT);
172 | connection.setReadTimeout(READ_TIMEOUT);
173 | HttpURLConnection.setFollowRedirects(true);
174 | connection.connect();
175 | // 获取响应吗
176 | int responseCode = connection.getResponseCode();
177 | log.info("[{}]响应码为:{}", url, responseCode);
178 | if (responseCode != 200) {
179 | return null;
180 | }
181 | boolean suportSharding = false;
182 | if("bytes".equals(connection.getHeaderField("Accept-Ranges"))){
183 | suportSharding = true;
184 | }
185 |
186 | String contentType = connection.getContentType();
187 | long contentLength = connection.getContentLengthLong();
188 | long lastModified = connection.getLastModified();
189 | connection.disconnect();
190 |
191 | FileInfo fileInfo = new FileInfo().setUrl(url).setContentLength(contentLength).setContentType(contentType)
192 | .setLastModified(lastModified).setSuportSharding(suportSharding);
193 |
194 | return fileInfo;
195 | }
196 |
197 | /**
198 | * 获取临时文件路径
199 | *
200 | * @param url
201 | * @return
202 | */
203 | private String getTmpFilePath(FileInfo fileInfo, int partIndex) {
204 | String contentType = fileInfo.getContentType();
205 | long contentLength = fileInfo.getContentLength();
206 | long lastModified = fileInfo.getLastModified();
207 | return TMP_PATH+DigestUtils.md5Hex(contentType + contentLength + lastModified) + ".tmp_" + partIndex;
208 | //return StringUtils.substringBeforeLast(filePath, File.separator)+File.separator+DigestUtils.md5Hex(url + filePath + contentType + contentLength + lastModified) + ".tmp_" + partIndex;
209 | }
210 | }
211 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/java/com/zsf/xxx/util/ParallelComputeUtil.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx.util;
2 |
3 | import java.util.List;
4 | import java.util.concurrent.ForkJoinPool;
5 | import java.util.concurrent.RecursiveAction;
6 | import java.util.concurrent.RecursiveTask;
7 |
/**
 * Helper for running ForkJoin tasks/actions, one short-lived pool per call.
 *
 * NOTE(review): generic type parameters appear to have been stripped from this
 * listing (e.g. "public static V compute(...)" was presumably
 * "public static <V> V compute(RecursiveTask<V> task)", and "List>" a nested
 * generic) — confirm against the original sources before editing.
 *
 * NOTE(review): a fresh ForkJoinPool is created on every call; frequent
 * callers pay pool setup/teardown each time.
 *
 * @author papapa — parallel computation utility
 */
public class ParallelComputeUtil {

	// Runs the task to completion on a fresh default-parallelism pool;
	// invoke() blocks until the task is done, so shutdown() after it is safe.
	public static V compute(RecursiveTask recursiveTask) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		V value = forkJoinPool.invoke(recursiveTask);
		forkJoinPool.shutdown();
		return value;
	}

	// Runs a batch task on a pool sized at 2x available processors.
	public static List> compute(BatchRecursiveTask recursiveTask) {
		ForkJoinPool forkJoinPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * 2);
		List> value = forkJoinPool.invoke(recursiveTask);
		forkJoinPool.shutdown();
		return value;
	}

	// Fire-and-forget: execute() does not wait for completion; shutdown()
	// still lets the already-submitted action run to its end.
	public static void execute(RecursiveAction recursiveAction) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		forkJoinPool.execute(recursiveAction);
		forkJoinPool.shutdown();
	}

	// Same fire-and-forget semantics for batch actions.
	public static void execute(BatchRecursiveAction recursiveAction) {
		ForkJoinPool forkJoinPool = new ForkJoinPool();
		forkJoinPool.execute(recursiveAction);
		forkJoinPool.shutdown();
	}
}
39 |
--------------------------------------------------------------------------------
/xxx-spider/src/main/resources/LICENSE:
--------------------------------------------------------------------------------
1 | 技术无罪,请把你的“技术”应用到实战中!
--------------------------------------------------------------------------------
/xxx-spider/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/xxx-spider/src/test/java/com/zsf/xxx/JunitTest.java:
--------------------------------------------------------------------------------
1 | package com.zsf.xxx;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.FileOutputStream;
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.net.HttpURLConnection;
10 | import java.net.URL;
11 | import java.util.HashMap;
12 | import java.util.Map;
13 |
14 | import org.apache.commons.io.IOUtils;
15 | import org.apache.commons.lang3.StringUtils;
16 | import org.apache.tika.exception.TikaException;
17 | import org.apache.tika.metadata.Metadata;
18 | import org.apache.tika.parser.AutoDetectParser;
19 | import org.apache.tika.sax.BodyContentHandler;
20 | import org.jsoup.nodes.Document;
21 | import org.jsoup.select.Elements;
22 | import org.junit.Test;
23 | import org.xml.sax.SAXException;
24 |
25 | import com.alibaba.fastjson.JSON;
26 | import com.alibaba.fastjson.JSONPath;
27 | import com.zsf.xxx.http.HttpDebugerViewer;
28 | import com.zsf.xxx.http.HttpViewer;
29 | import com.zsf.xxx.http.JsonpAfeldViewer;
30 | import com.zsf.xxx.util.HttpDownloader;
31 |
32 | import lombok.extern.slf4j.Slf4j;
33 |
34 | /**
35 | * @author papapa
36 | *
37 | */
38 | @Slf4j
39 | public class JunitTest {
40 |
41 | @Test
42 | public void testGetVideoUrl() throws IOException {
43 | String viewUrl = "https://cn.pornhub.com/view_video.php?viewkey=ph5d56b96c279c8";
44 | Document doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl);
45 | String scriptStr = doc.selectFirst("div#player > script").toString();
46 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}";
47 |
48 | // JSONObject json = JSON.parseObject(scriptStr,JSONObject.class);
49 | String videoUrl = (String) JSONPath.eval(scriptStr,
50 | "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl");// 获取720p且为mp4格式的地址
51 |
52 | System.out.println(videoUrl);
53 | }
54 |
55 | public static void main(String[] args) throws IOException {
56 | //String url = "https://vid3-l3.xvideos-cdn.com/videos/mp4/2/1/3/xvideos.com_213d9cdeb355e88c4ac217af62911445.mp4?e=1568258415&ri=1024&rs=85&h=b6b50269fb96d71fb8577e0ac76ca7bb";
57 | String url = "https://vid1-l3.xvideos-cdn.com/videos/mp4/9/7/0/xvideos.com_9707facedd27ab26bfed74db83328504.mp4?e=1568822783&ri=1024&rs=85&h=42d19ed678f5cf1fa3be0594e63a54d1";
58 | //String url = "https://ardownload2.adobe.com/pub/adobe/reader/mac/AcrobatDC/1901220034/AcroRdrDC_1901220034_MUI.dmg";
59 | String filePath = "/Users/zsf/git/xvidoes/mp4/fontawesome-free-5.9.0-web.zip";
60 | long startTime = System.currentTimeMillis();
61 | new HttpDownloader().download(url, filePath);
62 | log.info("共耗时:{}秒",System.currentTimeMillis() - startTime);
63 | }
64 |
65 | @Test
66 | public void test2() throws IOException{
67 | String url = "https://pcs.baidu.com/rest/2.0/pcs/file?method=download&path=%2F%E7%94%B5%E5%BD%B1%2F%E4%BE%8F%E7%BD%97%E7%BA%AA%E5%85%AC%E5%9B%AD%E4%B8%89%E9%83%A8%E6%9B%B2%2FJurassic.Park.1.1993.mkv&random=0.7139769215592586&app_id=498065";
68 | String filePath = "/Users/zsf/git/xvidoes/mp4/test.mkv";
69 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
70 | connection.setConnectTimeout(6 * 1000);// 超时6秒重连
71 | connection.setReadTimeout(6 * 1000);
72 |
73 | HttpURLConnection.setFollowRedirects(true);
74 | connection.setRequestProperty("range", "bytes=10-");
75 | connection.connect();
76 | // 获取响应吗
77 | int responseCode = connection.getResponseCode();
78 | if (responseCode != 206 && responseCode != 200) {
79 | return;
80 | }
81 | InputStream input = connection.getInputStream();
82 | IOUtils.copy(input, new FileOutputStream(new File(filePath)),1024);
83 | connection.disconnect();
84 | }
85 |
86 | @Test
87 | public void testHttpDebugerViewer() throws IOException{
88 | Document doc = new HttpDebugerViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd");
89 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
90 | log.info(insides.html());
91 | }
92 |
93 | @Test
94 | public void testJsonpAfeldViewer() throws IOException{
95 | Document doc = new JsonpAfeldViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd");
96 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside");
97 | log.info(insides.html());
98 | }
99 |
100 | @Test
101 | public void testTika() throws FileNotFoundException {
102 | InputStream input = new FileInputStream(new File("/Users/zsf/Desktop/2019年秋学籍模板.xls"));
103 | Map metaData = getMetaData(input);
104 | log.info(JSON.toJSONString(metaData, true));
105 | }
106 |
107 | private Map getMetaData(InputStream input){
108 | Map metadataMap = null;
109 | BodyContentHandler handler = new BodyContentHandler();
110 | Metadata metadata = new Metadata();
111 | AutoDetectParser parser = new AutoDetectParser();
112 | try {
113 | parser.parse(new FileInputStream(new File("/Users/zsf/Desktop/2019年秋学籍模板.xls")), handler, metadata);
114 | metadata.add("downloadUrl", "https://www.baidu.com");
115 | metadataMap = new HashMap<>();
116 | String[] metadataNames = metadata.names();
117 | for(String metaName : metadataNames){
118 | metadataMap.put(metaName,metadata.get(metaName));
119 | }
120 | } catch (IOException | SAXException | TikaException e) {
121 | e.printStackTrace();
122 | }
123 | return metadataMap;
124 | }
125 | }
126 |
--------------------------------------------------------------------------------