├── .gitignore
├── .idea
├── .gitignore
├── artifacts
│ └── jdread_downloader_jar.xml
├── compiler.xml
├── encodings.xml
├── jarRepositories.xml
├── misc.xml
└── vcs.xml
├── README.md
├── pom.xml
└── src
└── main
├── java
└── io
│ └── github
│ └── lovelyjuice
│ ├── Main.java
│ └── ReaderPage.java
└── resources
└── META-INF
└── MANIFEST.MF
/.gitignore:
--------------------------------------------------------------------------------
1 | /*.html
2 | /*.exe
3 | /out/
4 | /target/
5 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/artifacts/jdread_downloader_jar.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | $PROJECT_DIR$/out/artifacts/jdread_downloader_jar
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/jarRepositories.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 原理
2 | 通过控制浏览器翻页的方式获取书籍内容并保存到本地。
3 |
4 | ## 使用方法
5 |
6 | 1. 安装JDK(版本11及以上),过程略。
7 | 2. 下载ChromeDriver,将chromedriver.exe和jdread-downloader.jar放在同一目录。ChromeDriver的主版本号必须与你安装的Chrome主版本号一致,比如Chrome 110就得用ChromeDriver 110版本。下载链接: https://chromedriver.storage.googleapis.com/index.html
8 | 3. 在jar文件所在目录打开终端,运行jar包:`java -jar jdread-downloader.jar`
9 | 4. 运行后会自动打开chrome,并跳转到登录页面,登录后打开书籍的第一页,向下滚动页面直至能看到“下一章”的按钮
10 | 5. 回到命令行窗口,输入任意字符(q除外,输入q会退出程序)并按回车开始爬取。爬取完成后会生成HTML页面,如果想离线保存图片,可以用浏览器打开该页面,然后右键另存为完整的HTML文件。
11 | 6. 操作浏览器打开下一本书的第一页,同样向下滚动页面直至能看到“下一章”的按钮,然后重复上一个步骤。
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | io.github.lovelyjuice
8 | jdread-downloader
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 11
13 | 11
14 | UTF-8
15 |
16 |
17 |
18 |
19 | org.seleniumhq.selenium
20 | selenium-java
21 | 4.4.0
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/main/java/io/github/lovelyjuice/Main.java:
--------------------------------------------------------------------------------
1 | package io.github.lovelyjuice;
2 |
3 | import org.openqa.selenium.By;
4 | import org.openqa.selenium.NoSuchElementException;
5 | import org.openqa.selenium.chrome.ChromeDriver;
6 | import org.openqa.selenium.support.ui.WebDriverWait;
7 |
8 | import java.io.FileNotFoundException;
9 | import java.io.PrintWriter;
10 | import java.time.Duration;
11 | import java.util.LinkedHashSet;
12 | import java.util.Scanner;
13 | import java.util.stream.Collectors;
14 |
15 | public class Main {
16 | public static void main(String[] args) throws FileNotFoundException {
17 | System.setProperty("webdriver.chrome.driver", "chromedriver.exe");
18 | ChromeDriver driver = new ChromeDriver();
19 | driver.get("https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Febooks.jd.com%2Fbookshelf");
20 | System.out.println("操作浏览器登录并跳转到书籍第一页后,输入任意字符开始,输入q退出:");
21 | while (!(new Scanner(System.in).next()).equals("q")) {
22 | try {
23 | var contentBuilder = new StringBuilder("
");
24 | var styleSet = new LinkedHashSet();
25 | var readerPage = new ReaderPage(driver);
26 | var bookName = readerPage.bookName.getAttribute("innerText");
27 | System.out.printf("开始爬取《%s》%n", bookName);
28 | int debugTimes = 0;
29 | String debugFlag = System.getenv("debug");
30 | while (true) {
31 | if (debugTimes++ > 6 && debugFlag != null) break; //调试的时候只爬取6章
32 | String contentHtml = readerPage.content.getAttribute("outerHTML");
33 | System.out.println(contentHtml.split("reader-chapter-content")[1].substring(0, 50));
34 | contentBuilder.append(contentHtml);
35 | styleSet.addAll(readerPage.styleSheetList.stream() //有些书不同章节有不同的css样式,所以试着合并这些css
36 | .map(webElement -> webElement.getAttribute("outerHTML"))
37 | .collect(Collectors.toList()));
38 | System.out.println("-----------------------");
39 | try {
40 | readerPage.nextChapterMiddleButton.click();
41 | } catch (NoSuchElementException e) {
42 | break; //找不到“下一章”按钮说明已经浏览到最后一章
43 | }
44 | new WebDriverWait(driver, Duration.ofSeconds(300))
45 | .until(a -> a.findElement(By.cssSelector("div.reader-chapter-content"))); //“下一章”按钮在正文之前被渲染出来,所以只要渲染出按钮就可以跳转到下一章
46 | }
47 | contentBuilder.append("