├── README.md └── xxx-spider ├── .gitignore ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── zsf │ │ └── xxx │ │ ├── AbstractCrawler.java │ │ ├── Crawler.java │ │ ├── Launcher.java │ │ ├── PornhubCrawler.java │ │ ├── XvideosCrawler.java │ │ ├── http │ │ ├── AbstractHttpViewer.java │ │ ├── CorsAnywhereViewer.java │ │ ├── HttpDebugerViewer.java │ │ ├── HttpViewer.java │ │ └── JsonpAfeldViewer.java │ │ └── util │ │ ├── BatchRecursiveAction.java │ │ ├── BatchRecursiveTask.java │ │ ├── DownloadBatchRecursiveTask.java │ │ ├── FileInfo.java │ │ ├── FileItemInfo.java │ │ ├── HttpDownloader.java │ │ └── ParallelComputeUtil.java └── resources │ ├── LICENSE │ └── log4j2.xml └── test └── java └── com └── zsf └── xxx └── JunitTest.java /README.md: -------------------------------------------------------------------------------- 1 | # xxx 2 | 不用梯子获取pornhub xvideos的视频地址。获取到地址后可直接用迅雷批量下载视频文件,也可用VLC在线播放视频文件。 3 | -------------------------------------------------------------------------------- /xxx-spider/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | /.classpath 3 | /.project 4 | /.settings/ 5 | -------------------------------------------------------------------------------- /xxx-spider/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.zsf.xxx 5 | xxx-spider 6 | 1.0.0 7 | 8 | 9 | commons-io 10 | commons-io 11 | 2.5 12 | 13 | 14 | org.apache.commons 15 | commons-lang3 16 | 3.9 17 | 18 | 19 | org.apache.commons 20 | commons-text 21 | 1.8 22 | 23 | 24 | com.squareup.okhttp3 25 | okhttp 26 | 4.1.0 27 | 28 | 29 | org.jsoup 30 | jsoup 31 | 1.12.1 32 | 33 | 34 | org.apache.logging.log4j 35 | log4j-slf4j-impl 36 | 2.12.1 37 | 38 | 39 | com.alibaba 40 | fastjson 41 | 1.2.60 42 | 43 | 44 | junit 45 | junit 46 | 4.12 47 | test 48 | 49 | 50 | org.projectlombok 51 | lombok 52 | 1.18.8 53 | provided 54 | 55 | 56 | commons-codec 57 | commons-codec 58 | 1.13 59 | 60 | 
61 | org.apache.tika 62 | tika-parsers 63 | 1.22 64 | 65 | 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-assembly-plugin 71 | 3.1.1 72 | 73 | 74 | 75 | true 76 | com.zsf.xxx.Launcher 77 | 78 | true 79 | 80 | 81 | jar-with-dependencies 82 | 83 | UTF-8 84 | 85 | 86 | 87 | assemble-all 88 | package 89 | 90 | single 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/AbstractCrawler.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.commons.io.FileUtils; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | /** 15 | * @author papapa 16 | * 17 | */ 18 | public abstract class AbstractCrawler implements Crawler{ 19 | 20 | private static final Logger log = LoggerFactory.getLogger(AbstractCrawler.class); 21 | 22 | @Override 23 | public void execute(String dir) throws IOException { 24 | if (StringUtils.isBlank(dir)) { 25 | throw new IOException("文件保存目录为空"); 26 | } 27 | Map categories = getCategories(); 28 | if (categories != null) { 29 | for (Map.Entry entry : categories.entrySet()) { 30 | String title = entry.getKey(); 31 | String href = entry.getValue(); 32 | List viewUrls = getViewUrls(href); 33 | log.info("获取所有[{}]播放地址成功:{}", title, viewUrls); 34 | List videoUrls = getVideoUrls(viewUrls); 35 | log.info("获取所有[{}]视频地址成功:{}", title, videoUrls); 36 | 37 | String downloadFileName = dir + File.separatorChar + title + File.separatorChar + "download" + ".txt"; 38 | FileUtils.writeLines(new File(downloadFileName),"UTF-8", videoUrls, false); 39 | log.info("[{}]视频链接保存在{}",title,downloadFileName); 40 | 41 | log.info("开始下载[{}]下的视频",title); 42 | downloadVideos(dir + 
File.separatorChar + title, videoUrls); 43 | log.info("结束下载[{}]下的视频",title); 44 | } 45 | } 46 | } 47 | 48 | public void downloadVideos(String dir,List videoUrls){ 49 | for(String videoUrl : videoUrls){ 50 | downloadVideo(dir,videoUrl); 51 | } 52 | } 53 | 54 | public List getVideoUrls(List viewUrls) { 55 | List videoUrls = new ArrayList<>(); 56 | if (viewUrls != null && viewUrls.size() > 0) { 57 | for (String viewUrl : viewUrls) { 58 | String videoUrl = getVideoUrl(viewUrl); 59 | if(StringUtils.isNotBlank(videoUrl)){ 60 | videoUrls.add(videoUrl); 61 | } 62 | } 63 | } 64 | return videoUrls; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/Crawler.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | /** 8 | * 9 | * @author papapa 10 | * 11 | */ 12 | public interface Crawler { 13 | 14 | /** 15 | * 爬虫入口 16 | * 17 | * @param dir 18 | * 文件保存目录 19 | * @throws IOException 20 | */ 21 | void execute(String dir) throws IOException; 22 | 23 | /** 24 | * 获取分类集合 25 | * 26 | * @return 27 | */ 28 | Map getCategories(); 29 | 30 | /** 31 | * 获取视频地址 32 | * 33 | * @param href 34 | * @return 35 | */ 36 | List getViewUrls(String href); 37 | 38 | /** 39 | * 获取视频高清地址 40 | * 41 | * @param viewUrl 42 | * @return 43 | * @throws IOException 44 | */ 45 | String getVideoUrl(String viewUrl); 46 | 47 | /** 48 | * 下载视频 49 | * @param dir 文件保存目录 50 | * @param videoUrl 视频地址 51 | * @return 52 | */ 53 | boolean downloadVideo(String dir,String videoUrl); 54 | } 55 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/Launcher.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.IOException; 4 | import 
java.io.InputStream; 5 | import java.nio.charset.Charset; 6 | import java.util.List; 7 | 8 | import org.apache.commons.io.FileUtils; 9 | import org.apache.commons.io.IOUtils; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | /** 15 | * @author papapa 16 | * 17 | */ 18 | public class Launcher { 19 | 20 | private static final Logger log = LoggerFactory.getLogger(Launcher.class); 21 | 22 | public static void main(String[] args) throws IOException { 23 | InputStream in = Launcher.class.getResourceAsStream("/LICENSE"); 24 | if(in != null){ 25 | List lines = IOUtils.readLines(in,Charset.forName("UTF-8")); 26 | log.info(StringUtils.join(lines,"\r\n")); 27 | } 28 | log.info("爬虫开始"); 29 | String fileDir = FileUtils.getTempDirectoryPath();//默认文件保存目录 30 | if(args != null && args.length > 0){ 31 | fileDir = args[0]; 32 | } 33 | log.info("视频地址文件保存在:{}",fileDir); 34 | //new PornhubCrawler().execute(fileDir); 35 | new XvideosCrawler().execute(fileDir); 36 | log.info("爬虫停止"); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/PornhubCrawler.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.commons.lang3.StringUtils; 10 | import org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import org.jsoup.select.Elements; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import com.alibaba.fastjson.JSONPath; 17 | import com.zsf.xxx.http.HttpViewer; 18 | 19 | /** 20 | * @author papapa 21 | * 22 | */ 23 | public class PornhubCrawler extends AbstractCrawler { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(PornhubCrawler.class); 26 | 27 
| private static final String BASE_URL = "https://cn.pornhub.com";// 中文pornhub 28 | 29 | private static final String PARAMS = "hd=1";// 高清 30 | 31 | /** 32 | * 下载视频 33 | * 34 | * @param dir 35 | * @param videoUrl 36 | */ 37 | @Override 38 | public boolean downloadVideo(String dir, String videoUrl) { 39 | return false; 40 | } 41 | 42 | /** 43 | * 获取视频地址(720p mp4格式) 44 | * 45 | * @param viewUrl 46 | * @return 47 | * @throws IOException 48 | */ 49 | @Override 50 | public String getVideoUrl(String viewUrl) { 51 | String videoUrl = null; 52 | Document doc; 53 | try { 54 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl); 55 | if (doc == null) { 56 | return null; 57 | } 58 | String scriptStr = doc.selectFirst("div#player > script").toString(); 59 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}"; 60 | 61 | // 获取720p且为mp4格式的地址 62 | videoUrl = (String) JSONPath.eval(scriptStr, "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl"); 63 | } catch (IOException e) { 64 | log.error("获取视频地址错误:[" + videoUrl + "]", e); 65 | } 66 | 67 | return videoUrl; 68 | } 69 | 70 | /** 71 | * 获取该分类地址下的所有播放地址 72 | * 73 | * @param href 74 | * @return 75 | * @throws IOException 76 | */ 77 | @Override 78 | public List getViewUrls(String href) { 79 | List viewUrls = new ArrayList<>(); 80 | Document doc; 81 | try { 82 | doc = HttpViewer.getRandomInstance().getResponseDoc(href); 83 | if (doc == null) { 84 | return null; 85 | } 86 | List pageUrls = getPageUrls(doc); 87 | if (pageUrls != null) { 88 | viewUrls.addAll(pageUrls); 89 | } 90 | 91 | Element nextPageElement = doc.selectFirst("div.pagination3 > ul > li.page_next > a"); 92 | if (nextPageElement != null) { 93 | String nextPageHref = BASE_URL + "/" + nextPageElement.attr("href");// 下一页地址 94 | if (nextPageHref.endsWith("page=2")) {// 为了测试只获取1页数据 95 | return viewUrls; 96 | } 97 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址 98 | } 99 | } catch (IOException e) { 100 | 
log.error("获取文档对象错误:[" + href + "]", e); 101 | } 102 | 103 | return viewUrls; 104 | } 105 | 106 | /** 107 | * 获取当前页中每个的播放地址 108 | * 109 | * @param doc 110 | * @return 111 | */ 112 | public List getPageUrls(Document doc) { 113 | List urls = null; 114 | Elements lis = doc.select("ul#videoCategory > li.js-pop"); 115 | if (lis != null && lis.size() > 0) { 116 | urls = new ArrayList<>(); 117 | for (Element li : lis) { 118 | urls.add(BASE_URL + "/view_video.php?viewkey=" + li.attr("_vkey")); 119 | } 120 | } 121 | return urls; 122 | } 123 | 124 | /** 125 | * 获取分类集合 126 | * 127 | * @return 128 | * @throws IOException 129 | */ 130 | @Override 131 | public Map getCategories() { 132 | Map result = null; 133 | Document doc; 134 | try { 135 | doc = HttpViewer.getRandomInstance().getResponseDoc(BASE_URL + "/categories"); 136 | if (doc == null) { 137 | return null; 138 | } 139 | result = new HashMap<>(); 140 | Elements lis = doc.select("ul#categoriesListSection > li"); 141 | for (Element li : lis) { 142 | String href = BASE_URL + li.selectFirst("div.category-wrapper > a").attr("href"); 143 | if (href.contains("?")) {// 只获取高清视频 144 | href += "&" + PARAMS; 145 | } else { 146 | href += "?" 
+ PARAMS; 147 | } 148 | String title = li.selectFirst("div.category-wrapper > h5 > a").attr("data-mxptext"); 149 | result.put(title, href); 150 | log.info("分类:{},地址:{}", title, href); 151 | if (result.size() > 2) 152 | break;// 只获取3个 153 | } 154 | } catch (IOException e) { 155 | log.error("获取分类错误:", e); 156 | } 157 | return result; 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/XvideosCrawler.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.zsf.xxx.http.HttpViewer; 18 | import com.zsf.xxx.util.HttpDownloader; 19 | 20 | /** 21 | * @author papapa 22 | * 23 | */ 24 | public class XvideosCrawler extends AbstractCrawler{ 25 | 26 | private static final Logger log = LoggerFactory.getLogger(XvideosCrawler.class); 27 | 28 | private static final String BASE_URL = "https://www.xvideos.com"; 29 | 30 | /** 31 | * 获取视频高清地址 32 | * 33 | * @param viewUrl 34 | * @return 35 | * @throws IOException 36 | */ 37 | @Override 38 | public String getVideoUrl(String viewUrl) { 39 | String videoUrl = null; 40 | Document doc; 41 | try { 42 | doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl); 43 | if (doc == null) { 44 | return null; 45 | } 46 | Elements scripts = doc.getElementsByTag("script"); 47 | if (scripts == null || scripts.size() == 0) { 48 | return null; 49 | } 50 | for (Element script : scripts) { 51 | String scriptStr = script.html(); 52 | if (StringUtils.contains(scriptStr, 
"html5player.setVideoUrlHigh('")) { 53 | videoUrl = StringUtils.substringBetween(scriptStr, "html5player.setVideoUrlHigh('", "');"); 54 | break; 55 | } 56 | } 57 | } catch (IOException e) { 58 | log.error("获取视频地址错误:[" + viewUrl + "]", e); 59 | } 60 | 61 | return videoUrl; 62 | } 63 | 64 | /** 65 | * 获取该分类地址下的所有播放地址 66 | * 67 | * @param href 68 | * @return 69 | * @throws IOException 70 | */ 71 | @Override 72 | public List getViewUrls(String href) { 73 | List viewUrls = new ArrayList<>(); 74 | Document doc; 75 | try { 76 | doc = HttpViewer.getRandomInstance().getResponseDoc(href); 77 | if (doc == null) { 78 | return null; 79 | } 80 | List pageUrls = getPageUrls(doc); 81 | if (pageUrls != null) { 82 | viewUrls.addAll(pageUrls); 83 | } 84 | 85 | Element nextPageElement = doc.selectFirst("div.pagination > ul > li > a.next-page"); 86 | if (nextPageElement != null) { 87 | String nextPageHref = BASE_URL + nextPageElement.attr("href");// 下一页地址 88 | if (nextPageHref.contains("/3") || nextPageHref.contains("p=3")) {// 为了测试只获取2页数据 89 | return viewUrls; 90 | } 91 | viewUrls.addAll(getViewUrls(nextPageHref));// 递规获取每一页中的播放地址 92 | } 93 | } catch (IOException e) { 94 | log.error("获取文档对象错误:[" + href + "]", e); 95 | } 96 | 97 | return viewUrls; 98 | } 99 | 100 | /** 101 | * 获取当前页中每个的播放地址 102 | * 103 | * @param doc 104 | * @return 105 | */ 106 | public List getPageUrls(Document doc) { 107 | //Elements links = doc.select("div#content > div.mozaique > div div.thumb > a"); 108 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside"); 109 | if(insides == null || insides.size() == 0){ 110 | return null; 111 | } 112 | List urls = new ArrayList<>(); 113 | for(Element inside : insides){ 114 | Element hdElement = inside.selectFirst("span.video-hd-mark");//获取高清(720p)视频 115 | if(hdElement != null){ 116 | Element linkElement = inside.selectFirst("div.thumb > a"); 117 | String link = linkElement.attr("href"); 118 | urls.add(BASE_URL + link); 119 | } 120 | } 121 | 
return urls; 122 | } 123 | 124 | @Override 125 | public Map getCategories() { 126 | Map categories = new HashMap<>(); 127 | categories.put("丝袜",BASE_URL+"/c/Stockings-28"); 128 | categories.put("喷水",BASE_URL+"/c/Squirting-56"); 129 | categories.put("女同",BASE_URL+"/?k=lesbian&quality=hd"); 130 | 131 | return categories; 132 | } 133 | 134 | @Override 135 | public boolean downloadVideo(String dir, String videoUrl) { 136 | String fileName = StringUtils.substringAfterLast(videoUrl,"/"); 137 | fileName = StringUtils.substringBefore(fileName, "?"); 138 | String filePath = dir+File.separatorChar+fileName; 139 | try { 140 | new HttpDownloader().download(videoUrl, filePath); 141 | return true; 142 | } catch (IOException e) { 143 | e.printStackTrace(); 144 | } 145 | return false; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/http/AbstractHttpViewer.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.http; 2 | 3 | import java.io.IOException; 4 | import java.time.Duration; 5 | 6 | import org.jsoup.nodes.Document; 7 | 8 | import okhttp3.OkHttpClient; 9 | 10 | /** 11 | * @author papapa 12 | * 13 | */ 14 | public abstract class AbstractHttpViewer implements HttpViewer { 15 | 16 | public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"; 17 | 18 | @Override 19 | public String getResponesStr(String url) throws IOException { 20 | Document doc = getResponseDoc(url); 21 | if(doc != null){ 22 | return doc.html(); 23 | } 24 | return null; 25 | } 26 | 27 | public OkHttpClient.Builder getOkHttpClientBuilder(){ 28 | OkHttpClient.Builder builder = new OkHttpClient.Builder() 29 | .followRedirects(true) 30 | .followSslRedirects(true) 31 | .retryOnConnectionFailure(true) 32 | .readTimeout(Duration.ofSeconds(10L)) 33 | 
.connectTimeout(Duration.ofSeconds(10L)) 34 | .readTimeout(Duration.ofSeconds(10L)) 35 | .callTimeout(Duration.ofSeconds(10L)); 36 | return builder; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/http/CorsAnywhereViewer.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.http; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import okhttp3.Call; 11 | import okhttp3.OkHttpClient; 12 | import okhttp3.Request; 13 | import okhttp3.ResponseBody; 14 | 15 | /** 16 | * @author papapa 17 | * 18 | */ 19 | public class CorsAnywhereViewer extends AbstractHttpViewer { 20 | 21 | private static final Logger log = LoggerFactory.getLogger(CorsAnywhereViewer.class); 22 | 23 | private static final String PROXY_URL = "https://cors-anywhere.herokuapp.com/"; 24 | 25 | @Override 26 | public Document getResponseDoc(String url) throws IOException { 27 | log.info("获取页面:[{}]内容", url); 28 | OkHttpClient client = super.getOkHttpClientBuilder().build(); 29 | Request request = new Request.Builder().url(PROXY_URL + url).header("x-requested-with", "XMLHttpRequest") 30 | .header("User-Agent", USER_AGENT).get().build(); 31 | Call call = client.newCall(request); 32 | okhttp3.Response response = call.execute(); 33 | int code = response.code(); 34 | if (code == 200) { 35 | ResponseBody body = response.body(); 36 | Document doc = Jsoup.parse(body.string()); 37 | body.close(); 38 | response.close(); 39 | return doc; 40 | } 41 | return null; 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/http/HttpDebugerViewer.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.http; 2 | 3 | 
import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.apache.commons.text.StringEscapeUtils; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import okhttp3.Call; 14 | import okhttp3.FormBody; 15 | import okhttp3.Headers; 16 | import okhttp3.OkHttpClient; 17 | import okhttp3.Request; 18 | import okhttp3.RequestBody; 19 | import okhttp3.ResponseBody; 20 | 21 | /** 22 | * @author papapa 23 | * 24 | */ 25 | public class HttpDebugerViewer extends AbstractHttpViewer { 26 | 27 | private static final Logger log = LoggerFactory.getLogger(HttpDebugerViewer.class); 28 | 29 | private static final String PROXY_URL = "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx"; 30 | 31 | @Override 32 | public Document getResponseDoc(String url) throws IOException { 33 | log.info("获取页面:[{}]内容", url); 34 | OkHttpClient client = super.getOkHttpClientBuilder().build(); 35 | Request request = new Request.Builder().url(PROXY_URL).headers(Headers.of(getHeaders())).post(getRequestBody(url)).build(); 36 | Call call = client.newCall(request); 37 | okhttp3.Response response = call.execute(); 38 | int code = response.code(); 39 | if (code == 200) { 40 | ResponseBody body = response.body(); 41 | Document doc = Jsoup.parse(body.string()); 42 | body.close(); 43 | response.close(); 44 | String html = doc.selectFirst("div#ResultData pre").html(); 45 | html = StringEscapeUtils.unescapeHtml4(html); 46 | return Jsoup.parse(html); 47 | } 48 | return null; 49 | } 50 | 51 | private RequestBody getRequestBody(String url){ 52 | FormBody.Builder builder = new FormBody.Builder(); 53 | builder.add("UrlBox", url); 54 | builder.add("AgentList", "Google Chrome"); 55 | builder.add("VersionsList", "HTTP/1.1"); 56 | builder.add("MethodList", "GET"); 57 | return builder.build(); 58 | } 59 | 60 | private Map getHeaders(){ 61 | Map headers = new HashMap<>(); 62 | 
headers.put("Host", "www.httpdebugger.com"); 63 | headers.put("Origin", "http://www.httpdebugger.com"); 64 | headers.put("Pragma", "no-cache"); 65 | headers.put("Referer", "http://www.httpdebugger.com/tools/ViewHttpHeaders.aspx"); 66 | headers.put("Upgrade-Insecure-Requests", "1"); 67 | return headers; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/http/HttpViewer.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.http; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.commons.lang3.RandomUtils; 6 | import org.jsoup.nodes.Document; 7 | 8 | /** 9 | * @author papapa 10 | * 11 | */ 12 | public interface HttpViewer { 13 | 14 | /** 15 | * 获取url响应后的html内容 16 | * @param url 17 | * @return 18 | * @throws IOException 19 | */ 20 | String getResponesStr(String url) throws IOException; 21 | 22 | /** 23 | * 获取url响应后的文档对像 24 | * @param url 25 | * @return 26 | * @throws IOException 27 | */ 28 | Document getResponseDoc(String url) throws IOException; 29 | 30 | /** 31 | * 获取httpViewer的一个随机实例 32 | * @return 33 | */ 34 | public static HttpViewer getRandomInstance(){ 35 | HttpViewer[] VIEWERS = new HttpViewer[]{new CorsAnywhereViewer(),new HttpDebugerViewer(),new JsonpAfeldViewer()}; 36 | int index = RandomUtils.nextInt(0, 2); 37 | return VIEWERS[index]; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/http/JsonpAfeldViewer.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.http; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import okhttp3.Call; 11 | import okhttp3.OkHttpClient; 12 | import okhttp3.Request; 13 | import okhttp3.ResponseBody; 14 | 15 
| /** 16 | * @author papapa 17 | * 18 | */ 19 | public class JsonpAfeldViewer extends AbstractHttpViewer { 20 | 21 | private static final Logger log = LoggerFactory.getLogger(JsonpAfeldViewer.class); 22 | 23 | private static final String PROXY_URL = "https://jsonp.afeld.me/?url="; 24 | 25 | @Override 26 | public Document getResponseDoc(String url) throws IOException { 27 | log.info("获取页面:[{}]内容", url); 28 | OkHttpClient client = super.getOkHttpClientBuilder().build(); 29 | Request request = new Request.Builder().url(PROXY_URL + url).header("User-Agent", USER_AGENT).get().build(); 30 | Call call = client.newCall(request); 31 | okhttp3.Response response = call.execute(); 32 | int code = response.code(); 33 | if (code == 200) { 34 | ResponseBody body = response.body(); 35 | Document doc = Jsoup.parse(body.string()); 36 | body.close(); 37 | response.close(); 38 | return doc; 39 | } 40 | return null; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveAction.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import java.lang.reflect.Constructor; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.concurrent.RecursiveAction; 8 | 9 | /** 10 | * @author papapa 11 | * 12 | */ 13 | @SuppressWarnings("rawtypes") 14 | public abstract class BatchRecursiveAction extends RecursiveAction{ 15 | 16 | private static final long serialVersionUID = -2909644333830555865L; 17 | 18 | private List items; 19 | 20 | private Object ext; 21 | 22 | protected BatchRecursiveAction(List items,Object ext){ 23 | this.items = items; 24 | this.ext = ext; 25 | } 26 | 27 | @SuppressWarnings({ "unchecked" }) 28 | @Override 29 | protected void compute() { 30 | if(items != null && items.size() > 0){ 31 | if(items.size() == 1){ 32 | 
computeItem(items.get(0)); 33 | }else{ 34 | List actions = new ArrayList<>(); 35 | for(Object item : items){ 36 | List subActions = new ArrayList<>(); 37 | subActions.add(item); 38 | BatchRecursiveAction subBatchRecursiveAction = null; 39 | try { 40 | Constructor constructor = this.getClass().getDeclaredConstructors()[0]; 41 | constructor.setAccessible(true); 42 | subBatchRecursiveAction = (BatchRecursiveAction) constructor.newInstance(subActions,this.ext); 43 | actions.add(subBatchRecursiveAction); 44 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException 45 | | InvocationTargetException | SecurityException e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | invokeAll(actions); 50 | for(BatchRecursiveAction action : actions){ 51 | action.join(); 52 | } 53 | } 54 | } 55 | } 56 | 57 | public abstract void computeItem(Object item); 58 | } 59 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/BatchRecursiveTask.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import java.lang.reflect.Constructor; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.lang.reflect.Type; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.concurrent.RecursiveTask; 10 | 11 | /** 12 | * @author papapa 13 | * 14 | */ 15 | @SuppressWarnings("rawtypes") 16 | public abstract class BatchRecursiveTask extends RecursiveTask{ 17 | 18 | private static final long serialVersionUID = 2119394771132854398L; 19 | 20 | private List items; 21 | 22 | private Object ext; 23 | 24 | protected BatchRecursiveTask(List items,Object ext){ 25 | this.items = items; 26 | this.ext = ext; 27 | } 28 | 29 | public Object getExt(){ 30 | return this.ext; 31 | } 32 | 33 | @SuppressWarnings({ "unchecked" }) 34 | @Override 35 | protected List compute() { 36 | List values = 
new ArrayList<>(); 37 | if(items != null && items.size() > 0){ 38 | if(items.size() == 1){ 39 | values.add(computeItem(items.get(0))); 40 | }else{ 41 | BatchRecursiveTask[] tasks = new BatchRecursiveTask[items.size()]; 42 | int index = 0; 43 | Constructor constructor = this.getClass().getDeclaredConstructors()[0]; 44 | constructor.setAccessible(true); 45 | Type[] types = constructor.getGenericParameterTypes();//匿名内部内有多个参数 46 | Object[] initargs = new Object[types.length]; 47 | for(Object item : items){ 48 | try { 49 | initargs[types.length-1] = this.ext; 50 | initargs[types.length-2] = Arrays.asList(item); 51 | tasks[index] = (BatchRecursiveTask) constructor.newInstance(initargs); 52 | index++; 53 | } catch (InstantiationException | IllegalAccessException | IllegalArgumentException 54 | | InvocationTargetException | SecurityException e) { 55 | e.printStackTrace(); 56 | } 57 | } 58 | invokeAll(tasks); 59 | for(BatchRecursiveTask task : tasks){ 60 | values.addAll(task.join()); 61 | } 62 | } 63 | } 64 | return values; 65 | } 66 | public abstract Object computeItem(Object item); 67 | } -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/DownloadBatchRecursiveTask.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.InputStream; 6 | import java.io.OutputStream; 7 | import java.net.HttpURLConnection; 8 | import java.net.SocketTimeoutException; 9 | import java.net.URL; 10 | import java.util.List; 11 | import java.util.concurrent.TimeUnit; 12 | 13 | import javax.net.ssl.SSLException; 14 | 15 | import org.apache.commons.io.FileUtils; 16 | import org.apache.commons.io.IOUtils; 17 | 18 | import lombok.extern.slf4j.Slf4j; 19 | 20 | /** 21 | * @author papapa 22 | * 23 | */ 24 | @Slf4j 25 | public class DownloadBatchRecursiveTask extends BatchRecursiveTask { 26 
| 27 | private static final long serialVersionUID = -6577872267278165509L; 28 | 29 | private static final int RETRY_TIMES = 10;// 重试次数 30 | 31 | private static final int CONNECTION_TIMEOUT = 10 * 1000;//连接超时时间10秒 32 | 33 | private static final int READ_TIMEOUT = 10 * 1000;//读超时时间10秒 34 | 35 | protected DownloadBatchRecursiveTask(List items, Object ext) { 36 | super(items, ext); 37 | } 38 | 39 | @Override 40 | public Object computeItem(Object item) { 41 | FileItemInfo itemInfo = (FileItemInfo) item; 42 | String filePath = itemInfo.getFilePath(); 43 | File file = new File(filePath); 44 | long downloadLength = getDownloadLength(itemInfo); 45 | if (file.exists()) { 46 | if (file.length() == downloadLength) { 47 | log.info("分片[{}]已下载成功",itemInfo.getPartIndex()); 48 | itemInfo.setDownloadSuccess(true); 49 | return itemInfo; 50 | } 51 | } 52 | downloadPart(itemInfo, RETRY_TIMES); 53 | return itemInfo; 54 | } 55 | 56 | private FileItemInfo downloadPart(FileItemInfo itemInfo, int retryTimes) { 57 | String filePath = itemInfo.getFilePath(); 58 | int partIndex = itemInfo.getPartIndex(); 59 | 60 | String url = itemInfo.getUrl(); 61 | String range = getRange(itemInfo); 62 | InputStream input = null; 63 | OutputStream output = null; 64 | if(range == null){ 65 | itemInfo.setDownloadSuccess(true); 66 | log.info("分片[{}]下存在,无需下载",itemInfo.getPartIndex()); 67 | return itemInfo; 68 | } 69 | if (retryTimes == 0) { 70 | log.error("分片[{}]重试5次后依旧下载失败",partIndex); 71 | return null; 72 | } 73 | log.info("分片[{}],range:{},文件:{}",partIndex, range,filePath); 74 | HttpURLConnection connection = null; 75 | try { 76 | connection = (HttpURLConnection) new URL(url).openConnection(); 77 | connection.setConnectTimeout(CONNECTION_TIMEOUT); 78 | connection.setReadTimeout(READ_TIMEOUT); 79 | 80 | HttpURLConnection.setFollowRedirects(true); 81 | connection.setRequestProperty("range", "bytes=" + range); 82 | connection.connect(); 83 | // 获取响应吗 84 | int responseCode = connection.getResponseCode(); 85 | if 
(responseCode != 206 && responseCode != 200) { 86 | return null; 87 | } 88 | input = connection.getInputStream(); 89 | output = new FileOutputStream(filePath, true); 90 | IOUtils.copy(input, output, 1024); 91 | output.flush(); 92 | connection.disconnect(); 93 | itemInfo.setDownloadSuccess(true); 94 | log.info("分片[{}]下载成功",partIndex); 95 | } catch (Exception e) { 96 | if(!(e instanceof SocketTimeoutException) && !(e instanceof SSLException)){ 97 | log.error("分片["+partIndex+"]下载失败:",e); 98 | } 99 | retryTimes--; 100 | log.error("分片[{}]开始重试第[{}]次",partIndex,RETRY_TIMES - retryTimes); 101 | try { 102 | TimeUnit.SECONDS.sleep(1L); 103 | } catch (InterruptedException e1) { 104 | e1.printStackTrace(); 105 | } 106 | if(connection != null){ 107 | connection.disconnect(); 108 | } 109 | IOUtils.closeQuietly(input); 110 | IOUtils.closeQuietly(output); 111 | 112 | downloadPart(itemInfo, retryTimes); 113 | } 114 | return itemInfo; 115 | } 116 | 117 | /** 118 | * 要下载的长度 119 | * @param itemInfo 120 | * @return 121 | */ 122 | private long getDownloadLength(FileItemInfo itemInfo){ 123 | long downloadLength = itemInfo.getEndIndex() - itemInfo.getStartIndex() + (itemInfo.isLastSharding() ? 0 : 1); 124 | return downloadLength; 125 | } 126 | 127 | /** 128 | * 获取range 129 | * @param itemInfo 130 | * @return 当返回null时表示不需要再重新下载 131 | */ 132 | private String getRange(FileItemInfo itemInfo){ 133 | String filePath = itemInfo.getFilePath(); 134 | File tempFile = new File(filePath); 135 | FileInfo fileInfo = itemInfo.getFileInfo(); 136 | long contentLength = fileInfo.getContentLength(); 137 | 138 | long tempContentLength = tempFile.exists() ? 
FileUtils.sizeOf(tempFile) : 0; 139 | log.info("临时文件[{}]大小{}Bytes",tempFile.getAbsolutePath(),tempContentLength); 140 | String range = itemInfo.getStartIndex() + "-"; 141 | if(tempContentLength == 0){ 142 | if (itemInfo.getEndIndex() != contentLength) { 143 | range += itemInfo.getEndIndex(); 144 | } 145 | }else{ 146 | if (getDownloadLength(itemInfo) == tempContentLength) { 147 | range = null; 148 | }else{ 149 | range = itemInfo.getStartIndex() + tempContentLength + "-"+ itemInfo.getEndIndex(); 150 | } 151 | } 152 | return range; 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/FileInfo.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import lombok.Getter; 4 | import lombok.Setter; 5 | import lombok.experimental.Accessors; 6 | 7 | /** 8 | * @author papapa 9 | * 10 | */ 11 | @Getter 12 | @Setter 13 | @Accessors(chain = true) 14 | public class FileInfo { 15 | 16 | private String url;//文件原始下载地址 17 | 18 | private String filePath;//文件保存路径 19 | 20 | private long contentLength;// 内容长度 21 | 22 | private String contentType;// 内容类型 23 | 24 | private long lastModified;// 最后修改时间 25 | 26 | private boolean suportSharding;//是否支持分片 27 | 28 | } 29 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/FileItemInfo.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import lombok.Getter; 4 | import lombok.Setter; 5 | import lombok.experimental.Accessors; 6 | 7 | /** 8 | * @author papapa 9 | * 10 | */ 11 | @Getter 12 | @Setter 13 | @Accessors(chain = true) 14 | public class FileItemInfo implements Comparable { 15 | 16 | private FileInfo fileInfo; 17 | 18 | private String url;// 下载地址 19 | 20 | private String filePath;// 文件名 21 | 22 | private int partIndex;// 分片位置 23 | 24 | 
private long startIndex;// start offset of this shard 25 | 26 | private long endIndex;// end offset of this shard 27 | 28 | private boolean downloadSuccess;// whether the shard downloaded successfully 29 | 30 | private boolean firstSharding;// whether this is the first shard 31 | 32 | private boolean lastSharding;// whether this is the last shard 33 | 34 | @Override 35 | public int compareTo(FileItemInfo o) { 36 | int partIndex1 = this.getPartIndex(); 37 | int partIndex2 = o.getPartIndex(); 38 | /* FIX(idiom): Integer.compare replaces the hand-rolled three-way comparison */ return Integer.compare(partIndex1, partIndex2); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/HttpDownloader.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | import java.net.HttpURLConnection; 7 | import java.net.URL; 8 | import java.util.ArrayList; 9 | import java.util.Collections; 10 | import java.util.List; 11 | 12 | import org.apache.commons.codec.digest.DigestUtils; 13 | import org.apache.commons.io.FileUtils; 14 | import org.apache.commons.lang3.builder.ToStringBuilder; 15 | import org.apache.commons.lang3.builder.ToStringStyle; 16 | 17 | import lombok.extern.slf4j.Slf4j; 18 | 19 | /** 20 | * @author papapa 21 | * 22 | */ 23 | @Slf4j 24 | public class HttpDownloader { 25 | 26 | private static final String TMP_PATH = FileUtils.getTempDirectoryPath();// directory for shard temp files 27 | 28 | private static final int CONNECTION_TIMEOUT = 10 * 1000;// connect timeout: 10s 29 | 30 | private static final int READ_TIMEOUT = 10 * 1000;// read timeout: 10s 31 | 32 | 33 | public void download(String url, String filePath) throws IOException { 34 | FileInfo fileInfo = getFileInfo(url); 35 | if (fileInfo == null) { 36 | throw new IOException("获取文件信息失败:["+url+"]"); 37 | } 38 | fileInfo.setFilePath(filePath); 39 | log.info("文件信息:{}",ToStringBuilder.reflectionToString(fileInfo,
ToStringStyle.JSON_STYLE)); 40 | 41 | File savedFile = new File(filePath); 42 | if (savedFile.exists()) { 43 | if (FileUtils.sizeOf(savedFile) == fileInfo.getContentLength()) { 44 | log.info("文件已下载完成:[{}]", filePath); 45 | return; 46 | } 47 | FileUtils.forceDelete(savedFile); 48 | } 49 | savedFile.createNewFile(); 50 | if(fileInfo.isSuportSharding()){// server advertised Accept-Ranges: bytes, so download in shards 51 | long contentLength = fileInfo.getContentLength(); 52 | int threadNum = Runtime.getRuntime().availableProcessors() * 2; 53 | if(contentLength <= threadNum){ 54 | threadNum = 1; 55 | } 56 | multiThreadsDownload(fileInfo, threadNum); 57 | }else{ 58 | singleDownload(fileInfo); 59 | } 60 | } 61 | 62 | /** 63 | * Single-threaded download. 64 | * @param fileInfo 65 | */ 66 | private void singleDownload(FileInfo fileInfo) { 67 | /* TODO(review): not implemented — files from servers without range support are silently skipped by download() */ 68 | } 69 | 70 | /** 71 | * Start the multi-shard download of the file. 72 | * @param fileInfo 73 | * @param threadNum 74 | * @throws IOException 75 | */ 76 | private void multiThreadsDownload(FileInfo fileInfo, int threadNum) throws IOException { 77 | long contentLength = fileInfo.getContentLength(); 78 | if(contentLength <= threadNum){// one thread suffices when the content length is no larger than the thread count 79 | threadNum = 1; 80 | } 81 | List fileItemInfos = new ArrayList<>(); /* NOTE(review): raw type — presumably List<FileItemInfo>; generics look stripped by extraction, verify against the original source */ 82 | 83 | long partSize = contentLength / threadNum + 1; 84 | 85 | log.info("每片大小[{}]Bytes",partSize); 86 | for (int i = 0; i < threadNum; i++) { 87 | long startIndex = i * partSize; 88 | startIndex = i == 0 ?
0 : fileItemInfos.get(i-1).getEndIndex()+1L; 89 | 90 | long endIndex = startIndex + partSize -1 ; 91 | 92 | if (i == threadNum - 1) { 93 | endIndex = fileInfo.getContentLength(); 94 | } 95 | log.info("分片[{}]区间:{}-{},大小:{}",i+1,startIndex,endIndex,endIndex-startIndex+1); 96 | FileItemInfo fileItemInfo = new FileItemInfo(); 97 | fileItemInfo.setPartIndex(i+1); 98 | fileItemInfo.setUrl(fileInfo.getUrl()); 99 | fileItemInfo.setStartIndex(startIndex); 100 | fileItemInfo.setFilePath(getTmpFilePath(fileInfo, i+1)); 101 | fileItemInfo.setEndIndex(endIndex); 102 | fileItemInfo.setFileInfo(fileInfo); 103 | fileItemInfo.setFirstSharding(i==0); 104 | fileItemInfo.setLastSharding(i == threadNum - 1); 105 | fileItemInfos.add(fileItemInfo); 106 | } 107 | 108 | @SuppressWarnings("unchecked") 109 | List results = (List) ParallelComputeUtil.compute(new DownloadBatchRecursiveTask(fileItemInfos, null)); 110 | mergeFile(results); 111 | 112 | } 113 | 114 | /** 115 | * 合并文件 116 | * @param results 117 | * @throws IOException 118 | */ 119 | private void mergeFile(List results) throws IOException { 120 | if(results == null){ 121 | log.info("下载失败"); 122 | return; 123 | } 124 | Collections.sort(results); 125 | for(FileItemInfo itemInfo : results){ 126 | if(!itemInfo.isDownloadSuccess()){ 127 | log.error("分片[{}]下载失败,取消合并文件",itemInfo.getPartIndex()); 128 | return; 129 | } 130 | } 131 | addPartIntoFile(results); 132 | String filePath = results.get(0).getFileInfo().getFilePath(); 133 | log.info("文件[{}]下载完成,大小:{}Bytes",filePath,FileUtils.sizeOf(new File(filePath))); 134 | deleteTempFile(results); 135 | } 136 | 137 | /** 138 | * 删除临时文件 139 | * @param items 140 | */ 141 | private void deleteTempFile(List items){ 142 | for(FileItemInfo item : items){ 143 | boolean flag = FileUtils.deleteQuietly(new File(item.getFilePath())); 144 | log.info("临时文件[{}]删除{}",item.getFilePath(),flag ? 
"成功" : "失败"); 145 | } 146 | } 147 | /** 148 | * Write the downloaded shards into the main file at their offsets. 149 | * @param itemInfos 150 | * @throws IOException 151 | */ 152 | private void addPartIntoFile(List itemInfos) throws IOException{ 153 | String filePath = itemInfos.get(0).getFileInfo().getFilePath(); 154 | try (RandomAccessFile file = new RandomAccessFile(new File(filePath), "rw")) { /* FIX: try-with-resources — the file handle leaked whenever seek/write threw */ 155 | for(FileItemInfo itemInfo : itemInfos){ 156 | file.seek(itemInfo.getStartIndex()); 157 | File tempFile = new File(itemInfo.getFilePath()); 158 | file.write(FileUtils.readFileToByteArray(tempFile)); 159 | } 160 | } 161 | } 162 | 163 | /** 164 | * Fetch file metadata (length, type, range support) via a plain GET. 165 | * @param url 166 | * @return null when the server does not answer 200 167 | * @throws IOException 168 | */ 169 | private FileInfo getFileInfo(String url) throws IOException { 170 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection(); 171 | connection.setConnectTimeout(CONNECTION_TIMEOUT); 172 | connection.setReadTimeout(READ_TIMEOUT); 173 | HttpURLConnection.setFollowRedirects(true); 174 | connection.connect(); 175 | // read the response code 176 | int responseCode = connection.getResponseCode(); 177 | log.info("[{}]响应码为:{}", url, responseCode); 178 | if (responseCode != 200) { 179 | return null; 180 | } 181 | boolean suportSharding = false; 182 | if("bytes".equals(connection.getHeaderField("Accept-Ranges"))){ 183 | suportSharding = true; 184 | } 185 | 186 | String contentType = connection.getContentType(); 187 | long contentLength = connection.getContentLengthLong(); 188 | long lastModified = connection.getLastModified(); 189 | connection.disconnect(); 190 | 191 | FileInfo fileInfo = new FileInfo().setUrl(url).setContentLength(contentLength).setContentType(contentType) 192 | .setLastModified(lastModified).setSuportSharding(suportSharding); 193 | 194 | return fileInfo; 195 | } 196 | 197 | /** 198 | * Build the temp-file path for one shard. 199 | * 200 | * @param fileInfo 201 | * @param partIndex 202 | */ 203 | private String getTmpFilePath(FileInfo fileInfo, int partIndex) { 204 | String contentType = fileInfo.getContentType();
205 | long contentLength = fileInfo.getContentLength(); 206 | long lastModified = fileInfo.getLastModified(); 207 | return TMP_PATH+DigestUtils.md5Hex(contentType + contentLength + lastModified) + ".tmp_" + partIndex; /* name derives from metadata only, so a re-run of the same URL reuses (resumes) the same shard files */ 208 | //return StringUtils.substringBeforeLast(filePath, File.separator)+File.separator+DigestUtils.md5Hex(url + filePath + contentType + contentLength + lastModified) + ".tmp_" + partIndex; 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /xxx-spider/src/main/java/com/zsf/xxx/util/ParallelComputeUtil.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx.util; 2 | 3 | import java.util.List; 4 | import java.util.concurrent.ForkJoinPool; 5 | import java.util.concurrent.RecursiveAction; 6 | import java.util.concurrent.RecursiveTask; 7 | 8 | /** 9 | * @author papapa fork/join invocation helper: one throwaway pool per call 10 | */ 11 | public class ParallelComputeUtil { /* NOTE(review): generic type parameters appear stripped by HTML extraction — "public static V compute(RecursiveTask recursiveTask)" was presumably "public static <V> V compute(RecursiveTask<V> recursiveTask)" (likewise the List/BatchRecursiveTask overload); raw types here will not compile as-is, verify against the original source */ 12 | 13 | public static V compute(RecursiveTask recursiveTask) { 14 | ForkJoinPool forkJoinPool = new ForkJoinPool(); 15 | V value = forkJoinPool.invoke(recursiveTask); 16 | forkJoinPool.shutdown(); 17 | return value; 18 | } 19 | 20 | public static List compute(BatchRecursiveTask recursiveTask) { 21 | ForkJoinPool forkJoinPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors() * 2); 22 | List value = forkJoinPool.invoke(recursiveTask); 23 | forkJoinPool.shutdown(); 24 | return value; 25 | } 26 | 27 | public static void execute(RecursiveAction recursiveAction) { 28 | ForkJoinPool forkJoinPool = new ForkJoinPool(); 29 | forkJoinPool.execute(recursiveAction); 30 | forkJoinPool.shutdown(); 31 | } 32 | 33 | public static void execute(BatchRecursiveAction recursiveAction) { 34 | ForkJoinPool forkJoinPool = new ForkJoinPool(); 35 | forkJoinPool.execute(recursiveAction); 36 | forkJoinPool.shutdown(); 37 | } 38 | } 39 | --------------------------------------------------------------------------------
/xxx-spider/src/main/resources/LICENSE: -------------------------------------------------------------------------------- 1 | 技术无罪,请把你的“技术”应用到实战中! -------------------------------------------------------------------------------- /xxx-spider/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /xxx-spider/src/test/java/com/zsf/xxx/JunitTest.java: -------------------------------------------------------------------------------- 1 | package com.zsf.xxx; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.net.HttpURLConnection; 10 | import java.net.URL; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | import org.apache.commons.io.IOUtils; 15 | import org.apache.commons.lang3.StringUtils; 16 | import org.apache.tika.exception.TikaException; 17 | import org.apache.tika.metadata.Metadata; 18 | import org.apache.tika.parser.AutoDetectParser; 19 | import org.apache.tika.sax.BodyContentHandler; 20 | import org.jsoup.nodes.Document; 21 | import org.jsoup.select.Elements; 22 | import org.junit.Test; 23 | import org.xml.sax.SAXException; 24 | 25 | import com.alibaba.fastjson.JSON; 26 | import com.alibaba.fastjson.JSONPath; 27 | import com.zsf.xxx.http.HttpDebugerViewer; 28 | import com.zsf.xxx.http.HttpViewer; 29 | import com.zsf.xxx.http.JsonpAfeldViewer; 30 | import com.zsf.xxx.util.HttpDownloader; 31 | 32 | import lombok.extern.slf4j.Slf4j; 33 | 34 | /** 35 | * @author papapa 36 | * 37 | */ 38 | @Slf4j 39 | public class JunitTest { 40 | 41 | @Test 42 | public void testGetVideoUrl() throws IOException { 43 | String viewUrl = "https://cn.pornhub.com/view_video.php?viewkey=ph5d56b96c279c8"; 44 | 
Document doc = HttpViewer.getRandomInstance().getResponseDoc(viewUrl); 45 | String scriptStr = doc.selectFirst("div#player > script").toString(); 46 | scriptStr = "{" + StringUtils.substringBetween(scriptStr, "{", "};") + "}"; 47 | 48 | // JSONObject json = JSON.parseObject(scriptStr,JSONObject.class); 49 | String videoUrl = (String) JSONPath.eval(scriptStr, 50 | "$.mediaDefinitions[quality='720'][format='mp4'][0].videoUrl");/* pick the 720p mp4 URL; NOTE(review): JSONPath.eval is handed the raw JSON String — presumably it should be parsed first (or use JSONPath.extract); verify */ 51 | 52 | System.out.println(videoUrl); 53 | } 54 | 55 | public static void main(String[] args) throws IOException { 56 | //String url = "https://vid3-l3.xvideos-cdn.com/videos/mp4/2/1/3/xvideos.com_213d9cdeb355e88c4ac217af62911445.mp4?e=1568258415&ri=1024&rs=85&h=b6b50269fb96d71fb8577e0ac76ca7bb"; 57 | String url = "https://vid1-l3.xvideos-cdn.com/videos/mp4/9/7/0/xvideos.com_9707facedd27ab26bfed74db83328504.mp4?e=1568822783&ri=1024&rs=85&h=42d19ed678f5cf1fa3be0594e63a54d1"; 58 | //String url = "https://ardownload2.adobe.com/pub/adobe/reader/mac/AcrobatDC/1901220034/AcroRdrDC_1901220034_MUI.dmg"; 59 | String filePath = "/Users/zsf/git/xvidoes/mp4/fontawesome-free-5.9.0-web.zip"; 60 | long startTime = System.currentTimeMillis(); 61 | new HttpDownloader().download(url, filePath); 62 | log.info("共耗时:{}秒",(System.currentTimeMillis() - startTime) / 1000); /* FIX: the message says seconds (秒) but the raw value was milliseconds */ 63 | } 64 | 65 | @Test 66 | public void test2() throws IOException{ 67 | String url = "https://pcs.baidu.com/rest/2.0/pcs/file?method=download&path=%2F%E7%94%B5%E5%BD%B1%2F%E4%BE%8F%E7%BD%97%E7%BA%AA%E5%85%AC%E5%9B%AD%E4%B8%89%E9%83%A8%E6%9B%B2%2FJurassic.Park.1.1993.mkv&random=0.7139769215592586&app_id=498065"; 68 | String filePath = "/Users/zsf/git/xvidoes/mp4/test.mkv"; 69 | HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection(); 70 | connection.setConnectTimeout(6 * 1000);// 6s connect timeout 71 | connection.setReadTimeout(6 * 1000); 72 | 73 | HttpURLConnection.setFollowRedirects(true); 74 | connection.setRequestProperty("range", "bytes=10-"); 75 |
connection.connect(); 76 | // read the response code 77 | int responseCode = connection.getResponseCode(); 78 | if (responseCode != 206 && responseCode != 200) { 79 | return; 80 | } 81 | try (InputStream input = connection.getInputStream(); FileOutputStream output = new FileOutputStream(new File(filePath))) { /* FIX: close both streams — the FileOutputStream was never closed */ 82 | IOUtils.copy(input, output, 1024); } 83 | connection.disconnect(); 84 | } 85 | 86 | @Test 87 | public void testHttpDebugerViewer() throws IOException{ 88 | Document doc = new HttpDebugerViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd"); 89 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside"); 90 | log.info(insides.html()); 91 | } 92 | 93 | @Test 94 | public void testJsonpAfeldViewer() throws IOException{ 95 | Document doc = new JsonpAfeldViewer().getResponseDoc("https://www.xvideos.com/c/Squirting-56?quality=hd"); 96 | Elements insides = doc.select("div#content > div.mozaique > div div.thumb-inside"); 97 | log.info(insides.html()); 98 | } 99 | 100 | @Test 101 | public void testTika() throws FileNotFoundException { 102 | InputStream input = new FileInputStream(new File("/Users/zsf/Desktop/2019年秋学籍模板.xls")); 103 | Map metaData = getMetaData(input); 104 | log.info(JSON.toJSONString(metaData, true)); 105 | } 106 | 107 | private Map getMetaData(InputStream input){ /* NOTE(review): raw Map — presumably Map<String,String>; generics look stripped by extraction */ 108 | Map metadataMap = null; 109 | BodyContentHandler handler = new BodyContentHandler(); 110 | Metadata metadata = new Metadata(); 111 | AutoDetectParser parser = new AutoDetectParser(); 112 | try { 113 | parser.parse(input, handler, metadata); /* FIX: parse the stream that was passed in — the parameter was ignored and a hard-coded path re-opened */ 114 | metadata.add("downloadUrl", "https://www.baidu.com"); 115 | metadataMap = new HashMap<>(); 116 | String[] metadataNames = metadata.names(); 117 | for(String metaName : metadataNames){ 118 | metadataMap.put(metaName,metadata.get(metaName)); 119 | } 120 | } catch (IOException | SAXException | TikaException e) { 121 | e.printStackTrace(); 122 | } 123 | return metadataMap; 124 | } 125 | } 126 | 
--------------------------------------------------------------------------------