├── .gitignore ├── .idea ├── .gitignore ├── artifacts │ └── jdread_downloader_jar.xml ├── compiler.xml ├── encodings.xml ├── jarRepositories.xml ├── misc.xml └── vcs.xml ├── README.md ├── pom.xml └── src └── main ├── java └── io │ └── github │ └── lovelyjuice │ ├── Main.java │ └── ReaderPage.java └── resources └── META-INF └── MANIFEST.MF /.gitignore: -------------------------------------------------------------------------------- 1 | /*.html 2 | /*.exe 3 | /out/ 4 | /target/ 5 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/artifacts/jdread_downloader_jar.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | $PROJECT_DIR$/out/artifacts/jdread_downloader_jar 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/jarRepositories.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 9 | 10 | 14 | 15 | 19 | 20 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 原理 2 | 通过控制浏览器翻页的方式获取书籍内容并保存到本地。 3 | 4 | ## 使用方法 5 | 6 | 1. 安装jdk,jdk版本11以上,过程略。 7 | 2. 下载ChromeDriver,和jdread-downloader.jar放在同一目录。ChromeDriver大版本要等于你安装的chrome版本,比如chrome 110就得用ChromeDriver 110版本。下载链接: https://chromedriver.storage.googleapis.com/index.html 8 | 3. 在jar文件所在目录打开终端,运行jar包:`java -jar jdread-downloader.jar` 9 | 4. 运行后会自动打开chrome,并跳转到登录页面,登录后打开书籍的第一页,向下滚动页面直至能看到“下一章”的按钮 10 | 5. 回到命令行窗口输入任意字符按回车开始爬取,爬取完成后会生成HTML页面,如果想离线保存图片的话可以用浏览器打开然后右键另存为完整的HTML文件。 11 | 6. 操作浏览器打开下一本书的封面,重复上一个步骤。 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.lovelyjuice 8 | jdread-downloader 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 11 13 | 11 14 | UTF-8 15 | 16 | 17 | 18 | 19 | org.seleniumhq.selenium 20 | selenium-java 21 | 4.4.0 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/main/java/io/github/lovelyjuice/Main.java: -------------------------------------------------------------------------------- 1 | package io.github.lovelyjuice; 2 | 3 | import org.openqa.selenium.By; 4 | import org.openqa.selenium.NoSuchElementException; 5 | import org.openqa.selenium.chrome.ChromeDriver; 6 | import org.openqa.selenium.support.ui.WebDriverWait; 7 | 8 | import java.io.FileNotFoundException; 9 | import java.io.PrintWriter; 10 | import java.time.Duration; 11 | import java.util.LinkedHashSet; 12 | import java.util.Scanner; 13 | import java.util.stream.Collectors; 14 | 15 | public class Main { 16 | public static void main(String[] args) throws FileNotFoundException { 17 | System.setProperty("webdriver.chrome.driver", "chromedriver.exe"); 18 | ChromeDriver driver = new ChromeDriver(); 19 | driver.get("https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Febooks.jd.com%2Fbookshelf"); 20 | System.out.println("操作浏览器登录并跳转到书籍第一页后,输入任意字符开始,输入q退出:"); 21 | while (!(new Scanner(System.in).next()).equals("q")) { 22 | try { 23 | var contentBuilder = new StringBuilder(""); 24 | var styleSet = new LinkedHashSet(); 25 | var readerPage = new ReaderPage(driver); 26 | var bookName = readerPage.bookName.getAttribute("innerText"); 27 | System.out.printf("开始爬取《%s》%n", bookName); 28 | int debugTimes = 0; 29 | String debugFlag = System.getenv("debug"); 30 | while (true) { 31 | if (debugTimes++ > 6 && debugFlag != null) break; //调试的时候只爬取6章 32 | String contentHtml = readerPage.content.getAttribute("outerHTML"); 33 | System.out.println(contentHtml.split("reader-chapter-content")[1].substring(0, 50)); 34 | contentBuilder.append(contentHtml); 35 | styleSet.addAll(readerPage.styleSheetList.stream() //有些书不同章节有不同的css样式,所以试着合并这些css 36 | .map(webElement -> webElement.getAttribute("outerHTML")) 37 | .collect(Collectors.toList())); 38 | System.out.println("-----------------------"); 39 | try { 40 | readerPage.nextChapterMiddleButton.click(); 41 | } catch (NoSuchElementException e) { 42 | break; //找不到“下一章”按钮说明已经浏览到最后一章 43 | } 44 | new WebDriverWait(driver, Duration.ofSeconds(300)) 45 | .until(a -> a.findElement(By.cssSelector("div.reader-chapter-content"))); //“下一章”按钮在正文之前被渲染出来,所以只要渲染出按钮就可以跳转到下一章 46 | } 47 | contentBuilder.append(""); 48 | bookName = bookName.replace(":", ":").replace("?", "?").replace("\"", "“") 49 | .replaceAll("[\\\\\\/\\*<>\\|]", "_"); //防止书名中存在Windows不允许的文件名字符 50 | PrintWriter writer = new PrintWriter(bookName + ".html"); 51 | String staticStylesheet = ""; // main.css中阅读区域的默认样式 57 | writer.write("" + staticStylesheet + String.join("\n", styleSet) + ""); 58 | String content = contentBuilder.toString().replace("min-height: ;", "") 59 | .replace("; height: \"", ";\"") //京东前端写的css缺少属性值,不删掉的话转换epub时会报错,不过其实报错问题也不大 60 | .replaceAll("min-width:(.*?);", ""); //解除图片最小宽度限制,小屏设备也能轻松查看,但是对于百分比宽度的图片无效 61 | writer.write(content); 62 | writer.close(); 63 | System.out.printf("《%s》下载完成!%n", bookName); 64 | } catch (Exception e) { 65 | System.out.println(e); 66 | System.out.println("出错了,请重试!"); 67 | } 68 | System.out.println("跳转到书籍第一页后,输入任意字符开始,输入q退出:"); 69 | } 70 | driver.quit(); 71 | } 72 | } -------------------------------------------------------------------------------- /src/main/java/io/github/lovelyjuice/ReaderPage.java: -------------------------------------------------------------------------------- 1 | package io.github.lovelyjuice; 2 | 3 | import org.openqa.selenium.WebDriver; 4 | import org.openqa.selenium.WebElement; 5 | import org.openqa.selenium.support.FindBy; 6 | import org.openqa.selenium.support.PageFactory; 7 | import java.util.List; 8 | 9 | public class ReaderPage { 10 | 11 | public ReaderPage(WebDriver driver) { 12 | PageFactory.initElements(driver, this); 13 | } 14 | 15 | 16 | @FindBy(css = "div.reader-chapter-content") 17 | public WebElement content; 18 | 19 | @FindBy(css = "button.nextChapter") 20 | public WebElement nextChapterMiddleButton; 21 | 22 | @FindBy(css = "head > link[rel='stylesheet']") 23 | public List styleSheetList; 24 | 25 | @FindBy(css = "title") 26 | public WebElement bookName; 27 | } -------------------------------------------------------------------------------- /src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: io.github.lovelyjuice.Main 3 | 4 | --------------------------------------------------------------------------------