├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── pom.xml
└── src
    └── main
        └── java
            ├── Starter.java
            ├── cn
                └── chineseall
                │   ├── Batch.java
                │   ├── Book.java
                │   ├── Class.java
                │   ├── CoreService.java
                │   ├── Downloader.java
                │   ├── FixWhiteSpace.java
                │   ├── Node.java
                │   ├── PDFInfo.java
                │   ├── PDFReader.java
                │   ├── Tmp.java
                │   └── yus.java
            ├── com
                ├── njulib
                │   ├── Start.java
                │   ├── fix
                │   │   ├── FileRenamer.java
                │   │   ├── ListBook.java
                │   │   └── MissingPageCompletion.java
                │   ├── object
                │   │   ├── Book.java
                │   │   ├── BookClass.java
                │   │   ├── Books.java
                │   │   ├── InfoReader.java
                │   │   ├── RootBookClass.java
                │   │   ├── TerminalBookClass.java
                │   │   └── exception
                │   │   │   ├── BookDLException.java
                │   │   │   ├── BookPagesDLException.java
                │   │   │   └── PageDLException.java
                │   └── spider
                │   │   ├── BookDownloader.java
                │   │   ├── BookSearch.java
                │   │   └── NJULib.java
                └── sslibrary
                │   ├── Start.java
                │   ├── fix
                │       ├── FileRenamer.java
                │       ├── MissingPageCompletion.java
                │       └── Recovery.java
                │   ├── object
                │       ├── Book.java
                │       ├── BookClass.java
                │       ├── Books.java
                │       ├── InfoReader.java
                │       ├── RootBookClass.java
                │       ├── TerminalBookClass.java
                │       └── exception
                │       │   ├── BookDLException.java
                │       │   ├── BookPagesDLException.java
                │       │   └── PageDLException.java
                │   └── spider
                │       ├── BookDownloader.java
                │       ├── BookSearch.java
                │       ├── NJULib.java
                │       └── PDFGenerator.java
            └── utils
                ├── ImageMeger.java
                ├── conversion
                    ├── MyDecoder.java
                    ├── PDFMerge.java
                    └── PDFTool.java
                └── network
                    ├── MyByteArray.java
                    ├── MyHttpRequest.java
                    └── ReturnData.java


/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | target/
 2 | !.mvn/wrapper/maven-wrapper.jar
 3 | 
 4 | ### STS ###
 5 | .apt_generated
 6 | .classpath
 7 | .factorypath
 8 | .project
 9 | .settings
10 | .springBeans
11 | 
12 | ### IntelliJ IDEA ###
13 | .idea
14 | *.iws
15 | *.iml
16 | *.ipr
17 | 
18 | ### NetBeans ###
19 | nbproject/private/
20 | build/
21 | nbbuild/
22 | dist/
23 | nbdist/
24 | .nb-gradle/


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM maven:3-jdk-8
 2 | 
 3 | WORKDIR /code
 4 | ADD . .
 5 | RUN mvn package
 6 | RUN mv target/libpdf*-dependencies.jar target/libpdf.jar
 7 | 
 8 | WORKDIR /ebook
 9 | 
10 | ENTRYPOINT [ "java", "-jar", "/code/target/libpdf.jar"]
11 | 
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NJU-lib-Downloader
 2 | [超星电子书](http://www.sslibrary.com/)和[书香中国](http://sxnju.chineseall.cn/home/index)的电子书下载器
 3 | 
 4 | 命令行程序。可以下载图书并自动合成PDF。
 5 | 
 6 | ## 依赖
 7 | * Java 8 +
 8 | 
 9 | ## 使用方法
10 | 在 [release](https://github.com/padeoe/nju-lib-downloader/releases) 中下载发布的 jar 包，执行以下命令：
11 | ```
12 | 用法: java -jar libpdf.jar [-c=<tmpPath>] [-p=<outputPath>] [-t=<threadNumber>] URL
13 |       URL                   书籍链接
14 |   -c, --cache_path=<tmpPath>
15 |                             临时文件（分页pdf）存储路径，默认为当前路径
16 |   -p, --path=<outputPath>   pdf存储目录，默认为当前路径
17 |   -t=<threadNumber>         线程数量，默认为8
18 | 
19 | 示例: java -jar libpdf.jar -t 8 http://sxnju.chineseall.cn/v3/book/detail/VPeZj
 20 |       java -jar libpdf.jar -t 8 -p /home/pdf/ -c /tmp/pdf "http://img.sslibrary.com/n/slib/book/slib/10649113/65873989af6f4d809862aa11b16f650c/0e71a4d58ffba4e1b202d4b3fb30a81a.shtml?dxbaoku=false&deptid=275&fav=http%3A%2F%2Fwww.sslibrary.com%2Freader%2Fpdg%2Fpdgreader%3Fd%3Da1b248ecb4a78ba2087d8b5d0c5c950d%26ssid%3D10649113&fenlei=080401&spage=1&t=5&username=xxxxxx&view=-1"
21 | 
22 | ```
23 | 
24 | ### Docker
25 | 
26 | ```
27 | docker run --rm -v "$PWD":/ebook padeoe/nju-lib-downloader url
28 | ```
29 | 
30 | <h2>特别感谢</h2>
31 | 
32 | [@Nifury](https://github.com/Nifury)
33 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>com.padeoe.nju</groupId>
  8 |     <artifactId>libpdf</artifactId>
  9 |     <version>0.2.11</version>
 10 | 
 11 |     <properties>
 12 |         <maven.compiler.source>1.8</maven.compiler.source>
 13 |         <maven.compiler.target>1.8</maven.compiler.target>
 14 |     </properties>
 15 |     <build>
 16 |         <plugins>
 17 |             <plugin>
 18 |                 <artifactId>maven-assembly-plugin</artifactId>
 19 |                 <executions>
 20 |                     <execution>
 21 |                         <phase>package</phase>
 22 |                         <goals>
 23 |                             <goal>single</goal>
 24 |                         </goals>
 25 |                     </execution>
 26 |                 </executions>
 27 |                 <configuration>
 28 |                     <descriptorRefs>
 29 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
 30 |                     </descriptorRefs>
 31 |                     <archive>
 32 |                         <manifest>
 33 |                             <mainClass>Starter</mainClass>
 34 |                         </manifest>
 35 |                     </archive>
 36 |                 </configuration>
 37 |             </plugin>
 38 |         </plugins>
 39 |     </build>
 40 |     <dependencies>
 41 |         <dependency>
 42 |             <groupId>junit</groupId>
 43 |             <artifactId>junit</artifactId>
 44 |             <version>RELEASE</version>
 45 |         </dependency>
 46 |         <dependency>
 47 |             <groupId>org.jsoup</groupId>
 48 |             <artifactId>jsoup</artifactId>
 49 |             <version>1.8.3</version>
 50 |         </dependency>
 51 |         <!-- always needed -->
 52 |         <dependency>
 53 |             <groupId>com.itextpdf</groupId>
 54 |             <artifactId>kernel</artifactId>
 55 |             <version>7.1.0</version>
 56 |         </dependency>
 57 | 
 58 |         <!-- always needed -->
 59 |         <dependency>
 60 |             <groupId>com.itextpdf</groupId>
 61 |             <artifactId>io</artifactId>
 62 |             <version>7.1.0</version>
 63 |         </dependency>
 64 | 
 65 |         <!-- always needed -->
 66 |         <dependency>
 67 |             <groupId>com.itextpdf</groupId>
 68 |             <artifactId>layout</artifactId>
 69 |             <version>7.1.0</version>
 70 |         </dependency>
 71 | 
 72 |         <!-- only needed for forms -->
 73 |         <dependency>
 74 |             <groupId>com.itextpdf</groupId>
 75 |             <artifactId>forms</artifactId>
 76 |             <version>7.1.0</version>
 77 |         </dependency>
 78 | 
 79 |         <!-- only needed for PDF/A -->
 80 |         <dependency>
 81 |             <groupId>com.itextpdf</groupId>
 82 |             <artifactId>pdfa</artifactId>
 83 |             <version>7.1.0</version>
 84 |         </dependency>
 85 | 
 86 |         <!-- only needed for digital signatures -->
 87 |         <dependency>
 88 |             <groupId>com.itextpdf</groupId>
 89 |             <artifactId>sign</artifactId>
 90 |             <version>7.1.0</version>
 91 |         </dependency>
 92 | 
 93 |         <!-- only needed for Asian fonts -->
 94 |         <dependency>
 95 |             <groupId>com.itextpdf</groupId>
 96 |             <artifactId>font-asian</artifactId>
 97 |             <version>7.1.0</version>
 98 |         </dependency>
 99 | 
100 |         <dependency>
101 |             <groupId>com.itextpdf</groupId>
102 |             <artifactId>pdftest</artifactId>
103 |             <version>7.0.2</version>
104 |         </dependency>
105 | 
106 |         <dependency>
107 |             <groupId>org.zeroturnaround</groupId>
108 |             <artifactId>zt-exec</artifactId>
109 |             <version>1.9</version>
110 |         </dependency>
111 | 
112 |         <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
113 |         <dependency>
114 |             <groupId>org.apache.pdfbox</groupId>
115 |             <artifactId>pdfbox</artifactId>
116 |             <version>2.0.16</version>
117 |         </dependency>
118 |         <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox-tools -->
119 |         <dependency>
120 |             <groupId>org.apache.pdfbox</groupId>
121 |             <artifactId>pdfbox-tools</artifactId>
122 |             <version>2.0.16</version>
123 |         </dependency>
124 | 
125 |         <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
126 |         <dependency>
127 |             <groupId>com.fasterxml.jackson.core</groupId>
128 |             <artifactId>jackson-core</artifactId>
129 |             <version>2.10.1</version>
130 |         </dependency>
131 | 
132 |         <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
133 |         <dependency>
134 |             <groupId>com.fasterxml.jackson.core</groupId>
135 |             <artifactId>jackson-databind</artifactId>
136 |             <version>2.10.1</version>
137 |         </dependency>
138 | 
139 |         <!-- https://mvnrepository.com/artifact/info.picocli/picocli -->
140 |         <dependency>
141 |             <groupId>info.picocli</groupId>
142 |             <artifactId>picocli</artifactId>
143 |             <version>4.1.4</version>
144 |         </dependency>
145 | 
146 | 
147 |     </dependencies>
148 | 
149 | 
150 | </project>
151 | 


--------------------------------------------------------------------------------
/src/main/java/Starter.java:
--------------------------------------------------------------------------------
 1 | import com.sslibrary.spider.BookDownloader;
 2 | import picocli.CommandLine;
 3 | 
 4 | import java.nio.file.Paths;
 5 | 
 6 | /**
 7 |  * Command-line entry point that picks a downloader based on the book URL.
 8 |  * <p>
 9 |  * Created by padeoe on 2017/9/8.
10 |  */
11 | public class Starter implements Runnable {
12 |     /** Number of download threads handed to the downloader (defaults to 8). */
13 |     @CommandLine.Option(names = {"-t"}, description = "线程数量")
14 |     private int threadNumber = 8;
15 | 
16 |     /** Book URL; it must contain "chineseall.cn" or "img.sslibrary.com" to be handled. */
17 |     @CommandLine.Parameters(paramLabel = "URL", description = "书籍链接")
18 |     private String url;
19 | 
20 |     /** Optional PDF output directory; when unset the downloader keeps its own default. */
21 |     @CommandLine.Option(names = {"-p", "--path"}, description = "pdf存储目录")
22 |     private String outputPath;
23 | 
24 |     /** Optional directory for temporary per-page files; when unset the downloader keeps its own default. */
25 |     @CommandLine.Option(names = {"-c", "--cache_path"}, description = "临时文件（分页pdf）存储路径")
26 |     private String tmpPath;
27 | 
28 |     /**
29 |      * Parses the command line with picocli, runs {@link #run()} and exits with picocli's exit code.
30 |      *
31 |      * @param args raw command-line arguments
32 |      */
33 |     public static void main(String[] args) {
34 |         // Selects the KCMS colour-management provider before anything else runs.
35 |         // NOTE(review): presumably a workaround for image colour handling - confirm.
36 |         System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
37 |         int exitCode = new CommandLine(new Starter()).execute(args);
38 |         System.exit(exitCode);
39 |     }
40 | 
41 |     /**
42 |      * Downloads the book behind {@link #url} with the downloader that matches the URL's host.
43 |      * <p>
44 |      * The elapsed time is printed only when a download was actually attempted; an unrecognized
45 |      * URL just produces an error message. Any exception is printed and not propagated.
46 |      */
47 |     @Override
48 |     public void run() {
49 |         try {
50 |             long begin = System.currentTimeMillis();
51 |             if (url.contains("chineseall.cn")) {
52 |                 // The book id is the last path segment of the detail URL.
53 |                 String[] segments = url.split("/");
54 |                 String bookId = segments[segments.length - 1];
55 |                 // NOTE(review): account credentials are hard-coded here; consider moving them to configuration.
56 |                 cn.chineseall.Downloader bookDownloader = new cn.chineseall.Downloader(bookId, new cn.chineseall.CoreService("Maskeney", "147258"));
57 |                 bookDownloader.setThreadNumber(threadNumber);
58 |                 if (tmpPath != null) {
59 |                     bookDownloader.setTmpPathDir(Paths.get(tmpPath));
60 |                 }
61 |                 if (outputPath != null) {
62 |                     bookDownloader.setPath(Paths.get(outputPath));
63 |                 }
64 |                 bookDownloader.downloadBook();
65 |             } else if (url.contains("img.sslibrary.com")) {
66 |                 BookDownloader bookDownloader = new BookDownloader(url);
67 |                 bookDownloader.setThreadNumber(threadNumber);
68 |                 if (outputPath != null) {
69 |                     bookDownloader.setPath(outputPath);
70 |                 }
71 |                 if (tmpPath != null) {
72 |                     bookDownloader.setTmpPath(Paths.get(tmpPath));
73 |                 }
74 |                 bookDownloader.downloadBook();
75 |             } else {
76 |                 System.err.println("未能识别的url，请输入chineseall.cn或者img.sslibrary.com开头的书本url");
77 |                 // Nothing was downloaded, so do not report a finished download.
78 |                 return;
79 |             }
80 |             System.out.println("下载结束，耗时" + (System.currentTimeMillis() - begin) / 1000 + "秒");
81 |         } catch (Exception e) {
82 |             e.printStackTrace();
83 |         }
84 |     }
85 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/Batch.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.io.IOException;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Paths;
 6 | import java.util.stream.Stream;
 7 | 
 8 | /**
 9 |  * Writes the string form of every book in one category to a text file, one book per line.
10 |  * <p>
11 |  * Created by padeoe on 2017/4/14.
12 |  */
13 | public class Batch {
14 |     /** Output file used when no path is passed on the command line. */
15 |     private static final String DEFAULT_OUTPUT_FILE = "C:\\Users\\padeo\\Desktop\\法律书籍.txt";
16 | 
17 |     /**
18 |      * @param args optional arguments: {@code args[0]} is the category id (default "D9"),
19 |      *             {@code args[1]} is the output file path (default {@link #DEFAULT_OUTPUT_FILE})
20 |      */
21 |     public static void main(String[] args) {
22 |         String className = (args != null && args.length > 0) ? args[0] : "D9";
23 |         String outputFile = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_FILE;
24 |         try {
25 |             Stream<String> bookStream = new Class(className).getNewBooks()
26 |                     .flatMap(books -> books.stream().map(Book::toString));
27 |             // A Stream is not an Iterable, so adapt it through its iterator for Files.write.
28 |             Files.write(Paths.get(outputFile), (Iterable<String>) bookStream::iterator);
29 |         } catch (IOException e) {
30 |             e.printStackTrace();
31 |         }
32 |     }
33 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/Book.java:
--------------------------------------------------------------------------------
  1 | package cn.chineseall;
  2 | 
  3 | import com.fasterxml.jackson.databind.ObjectMapper;
  4 | import com.fasterxml.jackson.databind.node.ObjectNode;
  5 | import org.jsoup.Jsoup;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.nodes.Element;
  8 | import org.jsoup.select.Elements;
  9 | import utils.network.MyHttpRequest;
 10 | 
 11 | import java.io.IOException;
 12 | import java.util.ArrayList;
 13 | import java.util.LinkedList;
 14 | import java.util.List;
 15 | 
 16 | /**
 17 |  * A book on chineseall.cn plus the metadata scraped from a listing page.
 18 |  * <p>
 19 |  * Created by padeoe on 2017/4/10.
 20 |  */
 21 | public class Book {
 22 |     /** Identifier; when scraped, the part of the detail link after "/book/detail/" if that marker is present. */
 23 |     String id;
 24 |     /** Value sent as {@code bookId} when requesting the outline; no constructor sets it. */
 25 |     String idInt;
 26 |     String name;
 27 |     String press;
 28 |     String author;
 29 |     String publishDate;
 30 |     String introduction;
 31 |     String coverUrl;
 32 | 
 33 |     /** Number of outline requests made before the last failure is rethrown. */
 34 |     private static final int OUTLINE_ATTEMPTS = 20;
 35 | 
 36 |     public String getId() {
 37 |         return id;
 38 |     }
 39 | 
 40 |     public void setId(String id) {
 41 |         this.id = id;
 42 |     }
 43 | 
 44 |     public String getIdInt() {
 45 |         return idInt;
 46 |     }
 47 | 
 48 |     public void setIdInt(String idInt) {
 49 |         this.idInt = idInt;
 50 |     }
 51 | 
 52 |     public String getName() {
 53 |         return name;
 54 |     }
 55 | 
 56 |     public void setName(String name) {
 57 |         this.name = name;
 58 |     }
 59 | 
 60 |     public String getPress() {
 61 |         return press;
 62 |     }
 63 | 
 64 |     public void setPress(String press) {
 65 |         this.press = press;
 66 |     }
 67 | 
 68 |     public String getAuthor() {
 69 |         return author;
 70 |     }
 71 | 
 72 |     public void setAuthor(String author) {
 73 |         this.author = author;
 74 |     }
 75 | 
 76 |     public String getPublishDate() {
 77 |         return publishDate;
 78 |     }
 79 | 
 80 |     public void setPublishDate(String publishDate) {
 81 |         this.publishDate = publishDate;
 82 |     }
 83 | 
 84 |     public String getIntroduction() {
 85 |         return introduction;
 86 |     }
 87 | 
 88 |     public void setIntroduction(String introduction) {
 89 |         this.introduction = introduction;
 90 |     }
 91 | 
 92 |     public String getCoverUrl() {
 93 |         return coverUrl;
 94 |     }
 95 | 
 96 |     public void setCoverUrl(String coverUrl) {
 97 |         this.coverUrl = coverUrl;
 98 |     }
 99 | 
100 |     /** Creates a book that only knows its id; every other field stays {@code null}. */
101 |     public Book(String id) {
102 |         this.id = id;
103 |     }
104 | 
105 |     /** Creates a book with all listing metadata; {@code idInt} is left unset. */
106 |     public Book(String id, String name, String press, String author, String publishDate, String introduction, String coverUrl) {
107 |         this.id = id;
108 |         this.name = name;
109 |         this.press = press;
110 |         this.author = author;
111 |         this.publishDate = publishDate;
112 |         this.introduction = introduction;
113 |         this.coverUrl = coverUrl;
114 |     }
115 | 
116 |     @Override
117 |     public String toString() {
118 |         return "Book{" +
119 |                 "id='" + id + '\'' +
120 |                 ", name='" + name + '\'' +
121 |                 ", press='" + press + '\'' +
122 |                 ", author='" + author + '\'' +
123 |                 ", publishDate='" + publishDate + '\'' +
124 |                 ", introduction='" + introduction + '\'' +
125 |                 ", coverUrl='" + coverUrl + '\'' +
126 |                 '}';
127 |     }
128 | 
129 |     /**
130 |      * Requests the directory tree for {@code idInt} and converts it into outline nodes.
131 |      * <p>
132 |      * Any failure (network, JSON, missing tree element) is retried; the failure of the final
133 |      * attempt is rethrown to the caller.
134 |      *
135 |      * @return the top-level outline nodes
136 |      * @throws IOException if the final attempt fails with an I/O error
137 |      */
138 |     public List<Node> getOutline() throws IOException {
139 |         // e.g. http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871
140 |         String url = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF";
141 |         ObjectMapper mapper = new ObjectMapper();
142 |         for (int i = 0; i < OUTLINE_ATTEMPTS; i++) {
143 |             try {
144 |                 String result = MyHttpRequest.get(url, null, "UTF-8", 3000);
145 | 
146 |                 // The JSON response carries the tree as an HTML fragment in its "data" field.
147 |                 result = mapper.readValue(result, ObjectNode.class).get("data").textValue();
148 | 
149 |                 Document doc = Jsoup.parse(result);
150 |                 Elements elements = doc.select("ul[id=directoryTree]");
151 |                 return parseUL(elements.get(0));
152 |             } catch (Exception e) {
153 |                 if (i == OUTLINE_ATTEMPTS - 1) {
154 |                     throw e;
155 |                 }
156 |             }
157 |         }
158 |         // Not reached: the last iteration either returns or throws.
159 |         return null;
160 |     }
161 | 
162 |     /** Converts every {@code li} child of a {@code ul} element into a node, in document order. */
163 |     protected List<Node> parseUL(Element element) {
164 |         List<Node> nodes = new LinkedList<>();
165 |         for (Element child : element.children()) {
166 |             if (child.nodeName().equals("li")) {
167 |                 nodes.add(parseLi(child));
168 |             }
169 |         }
170 |         return nodes;
171 |     }
172 | 
173 |     /**
174 |      * Converts one {@code li} element: a lone {@code a} child is a leaf; otherwise a {@code span}
175 |      * child supplies the node itself and a {@code ul} child supplies its sub-nodes.
176 |      */
177 |     protected Node parseLi(Element liElement) {
178 |         Elements children = liElement.children();
179 |         if (children.size() == 1 && children.get(0).nodeName().equals("a")) {
180 |             return parseA(children.get(0));
181 |         }
182 |         Node root = new Node();
183 |         for (Element child : children) {
184 |             if (child.nodeName().equals("span")) {
185 |                 // Replaces root, so sub-nodes are kept only when the span precedes the ul.
186 |                 root = parseSpan(child);
187 |             }
188 |             if (child.nodeName().equals("ul")) {
189 |                 root.addAll(parseUL(child));
190 |             }
191 |         }
192 |         return root;
193 |     }
194 | 
195 |     /** Builds a node from the first child of a {@code span}; an empty span yields an empty node. */
196 |     protected Node parseSpan(Element spanElement) {
197 |         // children() never returns null, so test for emptiness before taking child(0).
198 |         if (!spanElement.children().isEmpty()) {
199 |             Element trueNode = spanElement.child(0);
200 |             return parseA(trueNode);
201 |         }
202 |         return new Node();
203 |     }
204 | 
205 |     /** Builds a node whose title is the link text and whose page is the integer in its {@code rel} attribute. */
206 |     protected Node parseA(Element aElement) {
207 |         Node result = new Node();
208 |         String nodeTitle = aElement.text();
209 | 
210 |         result.setTitle(nodeTitle);
211 |         result.setPage(Integer.parseInt(aElement.attr("rel")));
212 |         return result;
213 |     }
214 | 
215 |     /**
216 |      * Extracts the books listed in a category page.
217 |      *
218 |      * @param html page source; {@code null} (for example after a failed request) yields an empty list
219 |      * @return one book per {@code div.boxListLi5} entry that contains a titled link
220 |      */
221 |     public static List<Book> getBookFromHTML(String html) {
222 |         List<Book> books = new ArrayList<>(30);
223 |         if (html == null) {
224 |             // Jsoup.parse rejects null input; treat a missing page as having no books.
225 |             return books;
226 |         }
227 |         Document doc = Jsoup.parse(html);
228 |         for (Element entry : doc.select("div[class=boxListLi5]")) {
229 |             Elements idNameNode = entry.select("a[href][title]");
230 |             if (idNameNode.isEmpty()) {
231 |                 // Without a titled link there is no id, so the entry is skipped.
232 |                 continue;
233 |             }
234 |             String author = null, publishDate = null, press = null, introduction = null, coverUrl = null;
235 | 
236 |             Elements coverImageNode = entry.select("img[src]");
237 |             if (!coverImageNode.isEmpty()) {
238 |                 coverUrl = coverImageNode.attr("src");
239 |             }
240 | 
241 |             String name = idNameNode.get(0).attr("title");
242 |             String id = idNameNode.get(0).attr("href");
243 |             int idIndex = id.indexOf("/book/detail/");
244 |             if (idIndex != -1) {
245 |                 id = id.substring(idIndex + "/book/detail/".length());
246 |             }
247 | 
248 |             // The first span holds "author / press / publish date"; other shapes are ignored.
249 |             Elements pressNode = entry.select("span");
250 |             if (!pressNode.isEmpty()) {
251 |                 String[] pressInfoArray = pressNode.get(0).text().split("/");
252 |                 if (pressInfoArray.length == 3) {
253 |                     author = pressInfoArray[0].trim();
254 |                     press = pressInfoArray[1].trim();
255 |                     publishDate = pressInfoArray[2].trim();
256 |                 }
257 |             }
258 | 
259 |             Elements introNode = entry.select("p");
260 |             if (!introNode.isEmpty()) {
261 |                 introduction = introNode.text();
262 |             }
263 | 
264 |             books.add(new Book(id, name, press, author, publishDate, introduction, coverUrl));
265 |         }
266 |         return books;
267 |     }
268 | 
269 | }
234 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/Class.java:
--------------------------------------------------------------------------------
  1 | package cn.chineseall;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.nodes.Document;
  5 | import org.jsoup.select.Elements;
  6 | import utils.network.MyHttpRequest;
  7 | 
  8 | import java.io.FileWriter;
  9 | import java.io.IOException;
 10 | import java.util.HashSet;
 11 | import java.util.List;
 12 | import java.util.Set;
 13 | import java.util.concurrent.atomic.AtomicInteger;
 14 | import java.util.function.Function;
 15 | import java.util.stream.IntStream;
 16 | import java.util.stream.Stream;
 17 | 
 18 | import static cn.chineseall.Book.getBookFromHTML;
 19 | 
 20 | /**
 21 |  * A book category on chineseall.cn whose listing can be fetched page by page.
 22 |  * <p>
 23 |  * Created by padeoe on 2017/4/11.
 24 |  */
 25 | public class Class {
 26 |     private AtomicInteger needGet = new AtomicInteger(1);
 27 |     private String id;
 28 | 
 29 |     public Class(String id) {
 30 |         this.id = id;
 31 |     }
 32 | 
 33 |     /**
 34 |      * Fetches page 0 of this category and reads the total number of books from it.
 35 |      *
 36 |      * @return the total, or -1 when the page carries no usable value
 37 |      * @throws IOException if the request fails
 38 |      */
 39 |     public int getBookSize() throws IOException {
 40 |         String url = CoreService.baseUrl + "/org/show/sort/" + id + "/0";
 41 |         String result = MyHttpRequest.get(url, null, "UTF-8", 3000);
 42 |         return getBookSizeFromHtml(result);
 43 |     }
 44 | 
 45 |     /**
 46 |      * Reads the {@code value} of the {@code input#totalSize} element of a listing page.
 47 |      *
 48 |      * @param html page source, may be {@code null}
 49 |      * @return the parsed total, or -1 when the page is missing, has no such element,
 50 |      *         or the value is not an integer
 51 |      */
 52 |     public static int getBookSizeFromHtml(String html) {
 53 |         if (html == null) {
 54 |             return -1;
 55 |         }
 56 |         Elements sizeNode = Jsoup.parse(html).select("input[id=totalSize]");
 57 |         if (sizeNode.isEmpty()) {
 58 |             return -1;
 59 |         }
 60 |         try {
 61 |             return Integer.parseInt(sizeNode.attr("value").trim());
 62 |         } catch (NumberFormatException e) {
 63 |             // attr() yields "" when the attribute is absent; report "unknown" instead of failing.
 64 |             return -1;
 65 |         }
 66 |     }
 67 | 
 68 |     /**
 69 |      * Fetches one listing page of this category.
 70 |      *
 71 |      * @param page page number
 72 |      * @return the books on that page; an empty list when the request fails (the error is printed)
 73 |      */
 74 |     public List<Book> getBooks(int page) {
 75 |         String url = CoreService.baseUrl + "/org/show/sort/" + id + "/" + page;
 76 |         try {
 77 |             return getBookFromHTML(MyHttpRequest.get(url, null, "UTF-8", 3000));
 78 |         } catch (IOException e) {
 79 |             e.printStackTrace();
 80 |             // Do not hand a null page to the HTML parser; a failed page contributes no books.
 81 |             return new java.util.ArrayList<>();
 82 |         }
 83 |     }
 84 | 
 85 |     /** Same request and parsing as {@link #getBooks(int)}; kept for existing callers. */
 86 |     public List<Book> getNewBooks(int page) {
 87 |         return getBooks(page);
 88 |     }
 89 | 
 90 |     /** Same listing as {@link #getAllBooks()}; kept for existing callers. */
 91 |     public Stream<List<Book>> getNewBooks() throws IOException {
 92 |         return getAllBooks();
 93 |     }
 94 | 
 95 |     /**
 96 |      * Builds a parallel stream that fetches every listing page of this category.
 97 |      *
 98 |      * @return one list of books per page, for pages 0 through the computed last page
 99 |      * @throws IOException if the initial size request fails
100 |      */
101 |     public Stream<List<Book>> getAllBooks() throws IOException {
102 |         int size = getBookSize();
103 |         int lastPage = size / 30 + 1; // number of the last page, at 30 books per page
104 |         return IntStream.range(0, lastPage + 1).parallel().mapToObj(page -> getBooks(page));
105 |     }
106 | 
107 |     /**
108 |      * Thread that collects the book listing: it repeatedly claims the next page number from a
109 |      * shared counter and stops once the counter passes the last page.
110 |      */
111 |     class PageGetThread extends Thread {
112 |         Set<Book> books = new HashSet<>();
113 |         AtomicInteger needGettedPage;
114 |         int lastPage;
115 | 
116 |         public PageGetThread(AtomicInteger needGettedPage, int lastPage) {
117 |             this.needGettedPage = needGettedPage;
118 |             this.lastPage = lastPage;
119 |         }
120 | 
121 |         @Override
122 |         public void run() {
123 |             while (true) {
124 |                 int gettingpage = needGettedPage.getAndIncrement();
125 |                 if (gettingpage > lastPage) {
126 |                     break;
127 |                 }
128 |                 books.addAll(getBooks(gettingpage));
129 |             }
130 |         }
131 | 
132 |         public Set<Book> getThreadBooks() {
133 |             return books;
134 |         }
135 |     }
136 | 
137 |     /** HTML report under construction; a StringBuffer because rows are appended from a parallel stream. */
138 |     private static StringBuffer output = new StringBuffer("<html>\n" +
139 |             "<head><meta charset='UTF-8'></head>" +
140 |             "<table border=\"1\">\n" +
141 |             "<tr>\n" +
142 |             "  <th>书名</th>\n" +
143 |             "  <th>作者</th>\n" +
144 |             "  <th>出版年份</th>\n" +
145 |             "  <th>出版社</th>\n" +
146 |             "</tr>\n");
147 | 
148 |     /** Renders one table row (linked name, author, publish date, press); {@code null} for a null book. */
149 |     private static String getBookLineInTable(Book book) {
150 |         if (book == null) {
151 |             return null;
152 |         }
153 |         StringBuilder row = new StringBuilder();
154 |         row.append("<tr>\n");
155 |         row.append("<td><a target=\"_blank\" href=\"http://sxqh.chineseall.cn/v3/book/detail/" + book.getId() + "\">" + book.getName() + "</a></td>");
156 |         row.append(getAttr(Book::getAuthor, book));
157 |         row.append(getAttr(Book::getPublishDate, book));
158 |         row.append(getAttr(Book::getPress, book));
159 |         row.append("</tr>");
160 |         return row.toString();
161 |     }
162 | 
163 |     /** Wraps one attribute of the book in a table cell. */
164 |     private static String getAttr(Function<Book, String> attrGetter, Book book) {
165 |         return "<td>" + attrGetter.apply(book) + "</td>\n";
166 |     }
167 | 
168 |     /** Writes an HTML table of every book in category "D9" to D9.html in the working directory. */
169 |     public static void main(String[] args) {
170 |         try {
171 |             new Class("D9").getAllBooks().forEach(bookList ->
172 |                     bookList.forEach(book -> output.append(getBookLineInTable(book))));
173 |             output.append("</table>\n");
174 |             output.append("</html>");
175 |             // The page declares charset UTF-8, so write UTF-8 bytes rather than the platform default.
176 |             java.nio.file.Files.write(
177 |                     java.nio.file.Paths.get("D9.html"),
178 |                     output.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
179 |         } catch (IOException e) {
180 |             e.printStackTrace();
181 |         }
182 |     }
183 | 
184 | }
203 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/CoreService.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | import java.net.*;
 6 | import java.util.HashMap;
 7 | import java.util.List;
 8 | import java.util.Map;
 9 | 
/**
 * Logs in to the site at {@link #baseUrl} and exposes the session cookie returned by the
 * login endpoint.
 * <p>
 * Created by padeoe on 2017/4/10.
 */
public class CoreService {
    private String username;
    private String password;
    public static final String baseUrl = "http://sxqh.chineseall.cn";

    /**
     * @param username account name sent to the login endpoint
     * @param password account password sent to the login endpoint
     */
    public CoreService(String username, String password) {
        this.username = username;
        this.password = password;
    }

    /**
     * Posts the credentials to the login endpoint and returns the cookie it sets.
     *
     * @return the first {@code Set-Cookie} header value of the login response
     * @throws IOException if the request fails or the response carries no cookie
     */
    public String getSession() throws IOException {
        Map<String, String> attr = new HashMap<>();
        attr.put("Referer", baseUrl + "/sso/login.jsps?redirectUrl=" + baseUrl);
        attr.put("Origin", baseUrl);
        return getCookie(buildLoginForm(username, password), baseUrl + "/sso/logon.jsps", attr, "UTF-8", 3000);
    }

    /**
     * Builds the form body for the login request. Every value is URL-encoded so that
     * characters such as {@code &}, {@code =} or spaces in the credentials cannot corrupt
     * the form.
     */
    private static String buildLoginForm(String username, String password) throws IOException {
        return "userName=" + URLEncoder.encode(String.valueOf(username), "UTF-8")
                + "&userPass=" + URLEncoder.encode(String.valueOf(password), "UTF-8")
                + "&redirectUrl=" + URLEncoder.encode(baseUrl, "UTF-8");
    }

    /**
     * Sends a POST request without following redirects and returns the first
     * {@code Set-Cookie} header value of the response.
     *
     * @param data            request body, or {@code null} for an empty body
     * @param targetUrl       address to post to
     * @param requestProperty extra request headers, or {@code null} for none
     * @param inputEncoding   charset name used to encode {@code data}
     * @param timeout         connect and read timeout in milliseconds
     * @throws IOException on network failure or when no cookie is returned
     */
    private String getCookie(String data, String targetUrl, Map<String, String> requestProperty, String inputEncoding, int timeout) throws IOException {
        byte[] dataAsBytes = data == null ? new byte[0] : data.getBytes(inputEncoding);
        HttpURLConnection connection = (HttpURLConnection) new URL(targetUrl).openConnection();
        try {
            connection.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block the caller forever.
            connection.setReadTimeout(timeout);
            connection.setRequestMethod("POST");
            connection.setDoOutput(true);

            if (requestProperty != null) {
                for (Map.Entry<String, String> entry : requestProperty.entrySet()) {
                    connection.setRequestProperty(entry.getKey(), entry.getValue());
                }
            }
            if (data != null) {
                connection.setRequestProperty("Content-Length", String.valueOf(dataAsBytes.length));
            }
            // The cookie is set on the redirect response itself, so the redirect must not be followed.
            connection.setInstanceFollowRedirects(false);
            connection.connect();
            if (data != null) {
                try (OutputStream outputStream = connection.getOutputStream()) {
                    outputStream.write(dataAsBytes);
                }
            }

            List<String> cookies = connection.getHeaderFields().get("Set-Cookie");
            if (cookies == null || cookies.isEmpty()) {
                throw new IOException("No Set-Cookie header in response from " + targetUrl);
            }
            return cookies.get(0);
        } finally {
            connection.disconnect();
        }
    }
}
73 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/FixWhiteSpace.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.io.RandomAccessFile;
 6 | import java.nio.file.Files;
 7 | import java.util.LinkedList;
 8 | import java.util.List;
 9 | 
/**
 * Strips trailing zero bytes from the files of book folders whose file sizes are
 * multiples of 1024 bytes.
 * <p>
 * Created by padeoe on 2017/4/23.
 */
public class FixWhiteSpace {
    /**
     * @param args {@code args[0]} is the root directory to scan
     */
    public static void main(String[] args) {
        long begin = System.currentTimeMillis();
        List<File> allDir = getAllDir(new File(args[0]));
        System.out.println("总数本数 "+allDir.size());
        allDir.parallelStream().forEach(FixWhiteSpace::handleDir);
        System.out.println((System.currentTimeMillis() - begin) );
    }

    /**
     * Recursively collects every entry whose name starts with "《"; entries with other
     * names are descended into.
     *
     * @param rootDir directory to scan
     * @return the matching entries; empty (never {@code null}) when {@code rootDir} is a
     *         regular file, does not exist or cannot be listed
     */
    public static List<File> getAllDir(File rootDir) {
        List<File> result = new LinkedList<>();
        File[] children = rootDir.listFiles();
        if (children == null) {
            // Not a directory (or unreadable): nothing to collect. Returning an empty list
            // keeps the recursive addAll below from failing on plain files in the tree.
            return result;
        }
        for (File child : children) {
            if (child.getName().startsWith("《")) {
                result.add(child);
            } else {
                result.addAll(getAllDir(child));
            }
        }
        return result;
    }

    /**
     * Decides whether a book folder needs fixing by sampling its first file that is not a
     * {@code .txt}: when that file's size is a multiple of 1024 bytes the whole folder is
     * passed to {@link #fixDir(File)}.
     *
     * @param dir book folder to inspect
     */
    public static void handleDir(File dir) {
        File[] files = dir.listFiles();
        if (files == null) {
            // Not a directory or not listable; nothing to inspect.
            return;
        }
        if (files.length == 0) {
            System.out.println("空文件夹"+dir.getName());
            return;
        }
        for (File candidate : files) {
            if (candidate.getName().endsWith(".txt")) {
                continue;
            }
            if (candidate.length() % 1024 == 0) {
                System.out.println(dir.getName());
                fixDir(dir);
            }
            // Only the first non-text file is sampled.
            return;
        }
    }

    /**
     * Truncates every regular file in {@code dir} right after its last non-zero byte.
     * A file that consists solely of zero bytes is left untouched.
     *
     * @param dir folder whose files are trimmed
     */
    public static void fixDir(File dir) {
        File[] files = dir.listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue;
            }
            try {
                byte[] content = Files.readAllBytes(file.toPath());
                int length = content.length;
                for (int i = content.length - 1; i > -1; i--) {
                    if (content[i] != 0) {
                        length = i + 1;
                        break;
                    }
                }
                if (length == content.length) {
                    // No trailing padding to remove.
                    continue;
                }
                try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw")) {
                    randomAccessFile.setLength(length);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
80 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/Node.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | /**
 7 |  * Created by padeoe on 2017/4/11.
 8 |  */
 9 | public class Node {
10 |     private String title;
11 |     private int page;
12 |     private List<Node>children=new LinkedList<>();
13 |     public void addChild(Node node){
14 |         children.add(node);
15 |     }
16 | 
17 |     public String getTitle() {
18 |         return title;
19 |     }
20 | 
21 |     public void setTitle(String title) {
22 |         this.title = title;
23 |     }
24 | 
25 |     public int getPage() {
26 |         return page;
27 |     }
28 | 
29 |     public void setPage(int page) {
30 |         this.page = page;
31 |     }
32 | 
33 |     public List<Node> getChildren() {
34 |         return children;
35 |     }
36 | 
37 |     public Node addAll(List<Node>nodes){
38 |         nodes.forEach(node -> children.add(node));
39 |         return this;
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/PDFInfo.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import com.itextpdf.kernel.pdf.*;
 4 | import com.itextpdf.kernel.pdf.action.PdfAction;
 5 | import com.itextpdf.kernel.pdf.navigation.PdfExplicitDestination;
 6 | 
 7 | import java.io.IOException;
 8 | import java.util.List;
 9 | 
10 | /**
11 |  * Created by padeoe on 2017/4/11.
12 |  */
13 | public class PDFInfo {
14 | 
15 |     public static String getTitle(String src){
16 |         try {
17 |             PdfDocument pdfDoc = new PdfDocument(new PdfReader(src));
18 |             PdfDocumentInfo info = pdfDoc.getDocumentInfo();
19 |             String title=info.getTitle();
20 |             pdfDoc.close();
21 |             return title;
22 |         } catch (Exception e) {
23 |             return null;
24 |         }
25 |     }
26 | 
27 |     public static void addBookMark(Book book,String src,String dest){
28 |         PdfDocument pdfDoc = null;
29 |         try {
30 |             pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest));
31 |             PdfOutline root = pdfDoc.getOutlines(false);
32 |             PdfDocumentInfo info=pdfDoc.getDocumentInfo();
33 |             info.setTitle(book.getName());
34 | 
35 |             info.setAuthor(CoreService.baseUrl+"/book/"+book.getId());
36 |             List<Node> nodes = book.getOutline();
37 |             addOutline(nodes, root, pdfDoc);
38 |             pdfDoc.close();
39 |         } catch (IOException e) {
40 |             e.printStackTrace();
41 |         }
42 |     }
43 | 
44 | 
45 |     private static void addOutline(List<Node> nodes, PdfOutline root, PdfDocument pdfDocument) {
46 |         for (Node node : nodes) {
47 |             PdfOutline child = root.addOutline(node.getTitle());
48 |             child.addAction(PdfAction.createGoTo(
49 |                     PdfExplicitDestination.createFitH(pdfDocument.getPage(node.getPage()),
50 |                             pdfDocument.getPage(node.getPage()).getPageSize().getTop())));
51 |             addOutline(node.getChildren(), child, pdfDocument);
52 | 
53 |         }
54 |     }
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/PDFReader.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | import com.itextpdf.kernel.font.PdfFont;
 3 | import com.itextpdf.kernel.geom.Rectangle;
 4 | import com.itextpdf.kernel.pdf.PdfDocument;
 5 | import com.itextpdf.kernel.pdf.PdfReader;
 6 | import com.itextpdf.kernel.pdf.canvas.parser.EventType;
 7 | import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
 8 | import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
 9 | import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
10 | import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
11 | import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
12 | import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
13 | import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy;
14 | import com.itextpdf.test.annotations.type.SampleTest;
15 | import org.junit.Assert;
16 | import org.junit.BeforeClass;
17 | import org.junit.Test;
18 | import org.junit.experimental.categories.Category;
19 | 
20 | import java.io.File;
21 | import java.io.IOException;
22 | 
23 | @Category(SampleTest.class)
24 | public class PDFReader {
25 | //    public static final String SRC = "C:\\Users\\padeo\\Desktop\\nameddestinations.pdf";
26 | public static final String SRC = "C:\\Users\\padeo\\Desktop\\0081.pdf";
27 | 
28 |     @BeforeClass
29 |     public static void main() throws IOException {
30 | //        PdfReader pdfReader = new PdfReader(file);
31 | //        PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
32 | //
33 | //        strategy = parser.processContent(currentPage, new SimpleTextExtractionStrategy());
34 | //        content = strategy.getResultantText();
35 | 
36 |         File file = new File(SRC);
37 |         file.getParentFile().mkdirs();
38 |     }
39 | 
40 |     @Test
41 |     public void manipulatePdf() throws IOException {
42 |         PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
43 |         Rectangle rect = new Rectangle(36, 750, 523, 56);
44 | 
45 |         FontFilter fontFilter = new FontFilter(rect);
46 |         FilteredEventListener listener = new FilteredEventListener();
47 |         LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter);
48 |         new PdfCanvasProcessor(listener).processPageContent(pdfDoc.getFirstPage());
49 | 
50 |         String actualText = extractionStrategy.getResultantText();
51 |         System.out.println(actualText);
52 | 
53 |         pdfDoc.close();
54 | 
55 | 
56 |     }
57 | 
58 | 
59 |     class FontFilter extends TextRegionEventFilter {
60 |         public FontFilter(Rectangle filterRect) {
61 |             super(filterRect);
62 |         }
63 | 
64 |         @Override
65 |         public boolean accept(IEventData data, EventType type) {
66 |             return true;
67 | //            if (type.equals(EventType.RENDER_TEXT)) {
68 | //                TextRenderInfo renderInfo = (TextRenderInfo) data;
69 | //
70 | //                PdfFont font = renderInfo.getFont();
71 | //                if (null != font) {
72 | //                    String fontName = font.getFontProgram().getFontNames().getFontName();
73 | //                    System.out.println(fontName);
74 | //                    return fontName.equals("FZHTK-GBK1-0200020e4");
75 | //                    //FZHTK-GBK1-0200020e4
76 | //                    //return fontName.endsWith("Bold") || fontName.endsWith("Oblique");
77 | //                }
78 | //            }
79 | //            return false;
80 |         }
81 |     }
82 | }


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/Tmp.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.util.Arrays;
 4 | import java.util.List;
 5 | 
 6 | /**
 7 |  * Created by padeo on 2017/8/14.
 8 |  */
 9 | public class Tmp {
10 |     public static void main(String[] args) {
11 |         List<String> strings = Arrays.asList(new String[]{""});
12 |         strings.add("233");
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/cn/chineseall/yus.java:
--------------------------------------------------------------------------------
 1 | package cn.chineseall;
 2 | 
 3 | import java.io.InputStream;
 4 | import java.net.HttpURLConnection;
 5 | import java.net.URL;
 6 | 
 7 | public class yus {
 8 |     public static void main(String[] args) throws Exception {
 9 |         HttpURLConnection connection = (HttpURLConnection) new URL("http://sxqh.chineseall.cn/v3/book/content/VPeZj/pdf/9").openConnection();
10 |         connection.setRequestProperty("Accept", "*/*");
11 |         connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
12 |         connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
13 |         connection.setRequestProperty("Cache-Control", "no-cache");
14 |         connection.setRequestProperty("Connection", "keep-alive");
15 |         connection.setRequestProperty("Cookie", "JSESSIONID=6BC691FD580D2AFBCF38F4E9CB60FEC9");
16 |         connection.setRequestProperty("Pragma", "no-cache");
17 |         connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36");
18 |         connection.connect();
19 |         String location = connection.getHeaderField("Location");
20 |         String cookie = connection.getHeaderField("Set-Cookie");
21 |         cookie = cookie.substring(0, cookie.indexOf(';'));
22 | 
23 |         connection = (HttpURLConnection) new URL(location).openConnection();
24 |         connection.setRequestProperty("Accept", "*/*");
25 |         connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
26 |         connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
27 |         connection.setRequestProperty("Cache-Control", "no-cache");
28 |         connection.setRequestProperty("Connection", "keep-alive");
29 |         connection.setRequestProperty("Cookie", cookie);
30 |         connection.setRequestProperty("Pragma", "no-cache");
31 |         connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36");
32 |         try (InputStream is = connection.getInputStream()) {
33 |             utils.network.MyByteArray myByteArray = new utils.network.MyByteArray();
34 |             while (true) {
35 |                 myByteArray.ensureCapacity(4096);
36 |                 int len = is.read(myByteArray.getBuffer(), myByteArray.getOffset(), 4096);
37 |                 if (len == -1) {
38 |                     break;
39 |                 }
40 |                 myByteArray.addOffset(len);
41 |             }
42 |             byte[] bytes = new byte[myByteArray.getSize()];
43 |             System.arraycopy(myByteArray.getBuffer(),0,bytes,0,bytes.length);
44 |             System.out.println(new String(bytes));
45 |             //System.out.println(new String(is.readAllBytes()));
46 |         }
47 |     }
48 | }


--------------------------------------------------------------------------------
/src/main/java/com/njulib/Start.java:
--------------------------------------------------------------------------------
 1 | package com.njulib;
 2 | 
 3 | import com.njulib.object.BookClass;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | /**
 8 |  * @author padeoe
 9 |  * @Date: 2016/12/10
10 |  */
11 | public class Start {
12 |     /**
13 |      * 一个使用示例。请修改下面代码的两个文件存储路径，再运行。
14 |      * 当前示例会下载计算机分类下所有书。
15 |      * 下载过程中可以终止程序从而终止下载。下一次下载时会跳过下载分类中已有的书本。
16 |      *
17 |      * @param args
18 |      */
19 |     public static void main(String[] args) {
20 |         //创建一个书目分类，此处定义的是0T0P3010 计算机类，具体解释请参考中图法
21 |         // 格式必须和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>一致
22 |         BookClass root = new BookClass("0T0P");
23 |         try {
24 |             System.out.println(root.queryBooksSize());
25 |             /*root.downloadWithCataDir("G:\\", 5, "G:\\未分类\\pageDLFail.txt");*/
26 |         } catch (IOException e) {
27 |             e.printStackTrace();
28 |         }
29 | 
30 | 
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/fix/FileRenamer.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.fix;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.IOException;
  6 | import java.io.InputStream;
  7 | import java.nio.file.Files;
  8 | import java.nio.file.Path;
  9 | import java.nio.file.Paths;
 10 | import java.util.Arrays;
 11 | 
 12 | /**
 13 |  * 重命名之前版本程序下载的文件。
 14 |  * 之前版本造成了下载的文件命名不合理。
 15 |  *
 16 |  * @author padeoe
 17 |  * @Date: 2016/12/13
 18 |  */
 19 | public class FileRenamer {
 20 |     public static void main(String args[]) {
 21 |         renameZero("G:\\com.njulib.Test\\");
 22 |     }
 23 | 
 24 |     public static void renameZero(String rootDirPath) {
 25 |         Path root = Paths.get(rootDirPath);
 26 |         File rootDir = root.toFile();
 27 |         if (rootDir.isDirectory()) {
 28 |             File dirs[] = rootDir.listFiles();
 29 |             Arrays.asList(dirs).parallelStream().forEach(FileRenamer::handleEndDir);
 30 |         } else {
 31 |             System.out.println("根目录不是目录，终止");
 32 |         }
 33 |     }
 34 | 
 35 |     public static void handleEndDir(File dir) {
 36 |         if (dir.isDirectory()) {
 37 |             System.out.println("正在处理" + dir.getName());
 38 |             File files[] = dir.listFiles();
 39 |             Arrays.asList(files).parallelStream().forEach(file -> rename(dir, file));
 40 |         } else {
 41 |             System.out.println(dir.getName() + "不是目录，跳过");
 42 |         }
 43 |     }
 44 | 
 45 |     private static void rename(File dir, File file) {
 46 |         String name = file.getName();
 47 |         if (name.endsWith("png") || name.endsWith("jpg")) {
 48 |             String prefix = name.substring(0, name.indexOf('.'));
 49 |             name = name.replaceAll(prefix, String.format("%04d", Integer.parseInt(prefix)));
 50 |             try {
 51 |                 Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath());
 52 |             } catch (IOException e) {
 53 |                 System.out.println(file.toString());
 54 |             }
 55 |         }
 56 |     }
 57 | 
 58 |     public static void renameSuffix(String rootDirPath) {
 59 |         Path root = Paths.get(rootDirPath);
 60 |         File rootDir = root.toFile();
 61 |         if (rootDir.isDirectory()) {
 62 |             File dirs[] = rootDir.listFiles();
 63 |             Arrays.asList(dirs).parallelStream().forEach(FileRenamer::imageEndDir);
 64 |         } else {
 65 |             System.out.println("根目录不是目录，终止");
 66 |         }
 67 |     }
 68 | 
 69 |     public static void imageEndDir(File dir) {
 70 |         if (dir.isDirectory()) {
 71 |             System.out.println("正在处理" + dir.getName());
 72 |             File files[] = dir.listFiles();
 73 |             for (File file : files) {
 74 |                 String name = file.getName();
 75 |                 String prefix = name.substring(0, name.indexOf('.'));
 76 |                 String trueSuffix = getImageSuffix(file);
 77 |                 if ((name.endsWith("png") || name.endsWith("jpg")) && trueSuffix != null && !name.endsWith(trueSuffix)) {
 78 |                     name = prefix + "." + trueSuffix;
 79 |                     //  System.out.println("需要修改为"+name);
 80 |                     try {
 81 |                         Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath());
 82 |                     } catch (IOException e) {
 83 |                         System.out.println("修改出错" + file.toString());
 84 |                     }
 85 |                 }
 86 |             }
 87 |         } else {
 88 |             System.out.println(dir.getName() + "不是目录，跳过");
 89 |         }
 90 |     }
 91 | 
 92 | 
 93 |     public static String getImageSuffix(File image) {
 94 |         FileInputStream fileInputStream;
 95 |         InputStream inputStream;
 96 |         try {
 97 |             fileInputStream = new FileInputStream(image);
 98 |             inputStream = fileInputStream;
 99 |             byte[] array = new byte[10];
100 |             inputStream.read(array, 0, 10);
101 |             if (array[6] == 'J' && array[7] == 'F' && array[8] == 'I' && array[9] == 'F') {
102 |                 inputStream.close();
103 |                 return "jpg";
104 |             } else {
105 |                 inputStream.close();
106 |                 return "png";
107 |             }
108 |         } catch (IOException e) {
109 |             e.printStackTrace();
110 |         }
111 |         return null;
112 | 
113 |     }
114 | }
115 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/fix/ListBook.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.fix;
  2 | 
  3 | import com.njulib.object.InfoReader;
  4 | import com.njulib.object.Book;
  5 | 
  6 | import java.io.File;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.util.*;
 10 | import java.util.function.Function;
 11 | import java.util.stream.Collectors;
 12 | import java.util.stream.Stream;
 13 | 
 14 | /**
 15 |  * Created by padeoe on 2017/4/25.
 16 |  */
 17 | public class ListBook {
 18 |     private static int id = 1;
 19 |     private static StringBuffer output = new StringBuffer("<html>\n" +
 20 |             "<head><meta charset='UTF-8'></head>" +
 21 |             "<table border=\"1\">\n" +
 22 |             "<tr>\n" +
 23 |             "  <th>id</th>\n" +
 24 |             "  <th>编号</th>\n" +
 25 |             "  <th>书名</th>\n" +
 26 |             "  <th>作者</th>\n" +
 27 |             "  <th>出版年份</th>\n" +
 28 |             "  <th>分类</th>\n" +
 29 |             "</tr>\n");
 30 | 
 31 |     public static void main(String[] args) {
 32 |         getAllBooks(new File(args[0])).forEach(book -> output.append(getBookLineInTable(book)));
 33 |         output.append("</table>\n");
 34 |         output.append("</html>");
 35 |         FileWriter writer = null;
 36 |         try {
 37 |             writer = new FileWriter("out.html", false);
 38 |             writer.write(output.toString());
 39 |             writer.close();
 40 |         } catch (IOException e) {
 41 |             e.printStackTrace();
 42 |         }
 43 | 
 44 |     }
 45 | 
 46 |     public static List<Book> getAllBooks(File rootDir) {
 47 |         List<Book> result = new LinkedList<>();
 48 |         for (File subDir : rootDir.listFiles()) {
 49 |             if (subDir.getName().startsWith("《")) {
 50 |                 File infoFile = subDir.toPath().resolve("info.txt").toFile();
 51 |                 if (infoFile.exists()) {
 52 |                     result.add(new InfoReader(infoFile.getPath()).read());
 53 |                 }
 54 | 
 55 |             } else {
 56 |                 result.addAll(getAllBooks(subDir));
 57 |             }
 58 |         }
 59 |         return result;
 60 |     }
 61 | 
 62 |     public static class BookAndDir {
 63 |         Book book;
 64 |         File Dir;
 65 | 
 66 |         public BookAndDir(Book book, File dir) {
 67 |             this.book = book;
 68 |             Dir = dir;
 69 |         }
 70 | 
 71 |         public Book getBook() {
 72 |             return book;
 73 |         }
 74 | 
 75 |         public void setBook(Book book) {
 76 |             this.book = book;
 77 |         }
 78 | 
 79 |         public File getDir() {
 80 |             return Dir;
 81 |         }
 82 | 
 83 |         public void setDir(File dir) {
 84 |             Dir = dir;
 85 |         }
 86 |     }
 87 | 
 88 |     /**
 89 |      * 获取目录下所有书籍
 90 |      *
 91 |      * @param rootDir
 92 |      * @return
 93 |      */
 94 |     public static Stream<BookAndDir> getAllBooksAndDir(File rootDir) {
 95 |         Stream<File> inputFileStream = Arrays.stream(rootDir.listFiles());
 96 |         return inputFileStream.flatMap(subDir -> {
 97 |             if (subDir.getName().startsWith("《")) {
 98 |                 File infoFile = subDir.toPath().resolve("info.txt").toFile();
 99 |                 if (infoFile.exists()) {
100 |                     return Arrays.stream(new BookAndDir[]{new BookAndDir(new InfoReader(infoFile.getPath()).read(), subDir)});
101 |                 }
102 |                 return null;
103 |             } else {
104 |                 return getAllBooksAndDir(subDir);
105 |             }
106 |         }).filter(bookAndDir -> bookAndDir.getBook() != null);
107 |     }
108 | 
109 |     private static String getBookLineInTable(Book book) {
110 |         if (book != null) {
111 |             StringBuffer stringBuffer = new StringBuffer();
112 |             stringBuffer.append("<tr>\n");
113 |             stringBuffer.append(getLine(id + ""));
114 |             id++;
115 |             stringBuffer.append(getAttr(Book::getId, book));
116 |             stringBuffer.append(getAttr(Book::getName, book));
117 |             stringBuffer.append(getAttr(Book::getAuthor, book));
118 |             stringBuffer.append(getAttr(Book::getPublishDate, book));
119 |             stringBuffer.append(getAttr(Book::getDetailBookClass, book));
120 |             stringBuffer.append("</tr>");
121 |             return stringBuffer.toString();
122 |         } else {
123 |             return null;
124 |         }
125 | 
126 |     }
127 | 
128 |     private static String getAttr(Function<Book, String> attrGetter, Book book) {
129 |         return getLine(attrGetter.apply(book));
130 |     }
131 | 
132 |     private static String getLine(String content) {
133 |         StringBuffer stringBuffer = new StringBuffer();
134 |         stringBuffer.append("<td>");
135 |         stringBuffer.append(content);
136 |         stringBuffer.append("</td>\n");
137 |         return stringBuffer.toString();
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/fix/MissingPageCompletion.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.fix;
  2 | 
  3 | import com.njulib.spider.BookDownloader;
  4 | 
  5 | import java.io.FileWriter;
  6 | import java.io.IOException;
  7 | import java.nio.file.Files;
  8 | import java.nio.file.Paths;
  9 | import java.util.Iterator;
 10 | import java.util.List;
 11 | import java.util.regex.Matcher;
 12 | import java.util.regex.Pattern;
 13 | 
 14 | /**
 15 |  * 读取下载日志中的错误，进行缺页补全。
 16 |  *
 17 |  * @author padeoe
 18 |  *         Date: 2016/12/09
 19 |  */
 20 | public class MissingPageCompletion {
 21 |     private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString();
 22 |     private Pattern pattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}");
 23 | 
 24 |     /**
 25 |      * 创建一个{@code MissingPageCompletion}对象并将日志路径指定为{@code logLocation}
 26 |      *
 27 |      * @param logLocation 日志文件路径
 28 |      */
 29 |     public MissingPageCompletion(String logLocation) {
 30 |         this.logLocation = logLocation;
 31 |     }
 32 | 
 33 |     /**
 34 |      * 读取日志中所有下载失败的单页信息并重新下载一次。
 35 |      * 重新下载的日志会输入到原日志文件中
 36 |      */
 37 |     public void complete() {
 38 |         try {
 39 |             List<String> lines = Files.readAllLines(Paths.get(logLocation));
 40 |             Iterator<String> iterator = lines.iterator();
 41 |             while (iterator.hasNext()) {
 42 |                 String line = iterator.next();
 43 |                 Matcher matcher = pattern.matcher(line);
 44 |                 String url, location;
 45 |                 if (matcher.find()) {
 46 |                     url = matcher.group(1);
 47 |                     location = matcher.group(2);
 48 |                     System.out.println(url + " " + location);
 49 |                     try {
 50 |                         BookDownloader.downloadImage(url, location);
 51 |                         iterator.remove();
 52 |                     } catch (IOException downloadFail) {
 53 |                     }
 54 |                 }
 55 |             }
 56 | 
 57 |             StringBuilder newLog = new StringBuilder();
 58 |             lines.forEach(line -> newLog.append(line).append(System.getProperty("line.separator")));
 59 |             FileWriter writer = new FileWriter(logLocation, false);
 60 |             writer.write(newLog.toString());
 61 |             writer.close();
 62 |         } catch (IOException e) {
 63 |             e.printStackTrace();
 64 |         }
 65 |     }
 66 | 
 67 |     /**
 68 |      * 获取当前指定的日志的位置。
 69 |      * 如果没有指定位置，将默认使用当前路径下的名为{@link BookDownloader#ERROR_LOG_NAME}的文件
 70 |      *
 71 |      * @return 当前指定的日志的位置
 72 |      */
 73 |     public String getLogLocation() {
 74 |         return logLocation;
 75 |     }
 76 | 
 77 |     /**
 78 |      * 指定输入的日志的位置
 79 |      *
 80 |      * @param logLocation 作为输入的日志的位置
 81 |      */
 82 |     public void setLogLocation(String logLocation) {
 83 |         this.logLocation = logLocation;
 84 |     }
 85 | 
 86 |     /**
 87 |      * 获取当前指定的错误日志的单行格式
 88 |      *
 89 |      * @return 错误日志的单行格式
 90 |      */
 91 |     public Pattern getPattern() {
 92 |         return pattern;
 93 |     }
 94 | 
 95 |     /**
 96 |      * 设置日志的单行格式
 97 |      *
 98 |      * @param pattern 日志的单行格式
 99 |      */
100 |     public void setPattern(Pattern pattern) {
101 |         this.pattern = pattern;
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/Book.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.object;
  2 | 
  3 | import com.njulib.object.exception.BookDLException;
  4 | import org.jsoup.Jsoup;
  5 | import org.jsoup.nodes.Document;
  6 | import org.jsoup.nodes.Element;
  7 | import org.jsoup.select.Elements;
  8 | import com.njulib.spider.BookDownloader;
  9 | import com.njulib.spider.NJULib;
 10 | import utils.network.MyHttpRequest;
 11 | 
 12 | import java.io.IOException;
 13 | import java.net.URLDecoder;
 14 | 
 15 | /**
 16 |  * 图书。
 17 |  * <p>
 18 |  * 对应<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a> 中的图书。
 19 |  *
 20 |  * @author padeoe
 21 |  * @Date: 2016/12/08
 22 |  */
 23 | public class Book {
 24 |     /**
 25 |      * 书的id,唯一识别号，是由<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>定义的
 26 |      */
 27 |     private String id;
 28 |     /**
 29 |      * 书名，应该总是包含书名号《》
 30 |      */
 31 |     private String name;
 32 |     private String author;
 33 | 
 34 |     /**
 35 |      * 初始化一个新创建的{@code Book}对象。
 36 |      * <p>
 37 |      * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例，
 38 |      * 或者使用{@link com.njulib.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。
 39 |      *
 40 |      * @param id 书本id，需要和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>服务器一致
 41 |      */
 42 |     public Book(String id) {
 43 |         this.id = id;
 44 |     }
 45 | 
 46 |     /**
 47 |      * 获取书本的编号
 48 |      *
 49 |      * @return 书本编号
 50 |      */
 51 |     public String getId() {
 52 |         return id;
 53 |     }
 54 | 
 55 |     /**
 56 |      * 设置书本编号
 57 |      *
 58 |      * @param id 书本编号
 59 |      */
 60 |     public void setId(String id) {
 61 |         this.id = id;
 62 |     }
 63 | 
 64 |     /**
 65 |      * 获取书本名
 66 |      *
 67 |      * @return 书名，包含书名号《》
 68 |      */
 69 |     public String getName() {
 70 |         return name;
 71 |     }
 72 | 
 73 |     /**
 74 |      * 设置书名
 75 |      *
 76 |      * @param name 书名
 77 |      */
 78 |     public void setName(String name) {
 79 |         this.name = name;
 80 |     }
 81 | 
 82 |     /**
 83 |      * 获取书本作者，可能是null
 84 |      *
 85 |      * @return 书本作者
 86 |      */
 87 |     public String getAuthor() {
 88 |         return author;
 89 |     }
 90 | 
 91 |     /**
 92 |      * 设置书本作者
 93 |      *
 94 |      * @param author 书本作者
 95 |      */
 96 |     public void setAuthor(String author) {
 97 |         this.author = author;
 98 |     }
 99 | 
100 |     /**
101 |      * 获取书本出版日期
102 |      *
103 |      * @return 书本出版日期
104 |      */
105 |     public String getPublishDate() {
106 |         return publishDate;
107 |     }
108 | 
109 |     public void setPublishDate(String publishDate) {
110 |         this.publishDate = publishDate;
111 |     }
112 | 
113 |     /**
114 |      * 获取书本主题词，可能是null
115 |      *
116 |      * @return 书本主题词
117 |      */
118 |     public String getTheme() {
119 |         return theme;
120 |     }
121 | 
122 |     public void setTheme(String theme) {
123 |         this.theme = theme;
124 |     }
125 | 
126 |     /**
127 |      * 获取书本所在分类
128 |      *
129 |      * @return 书本所在分类
130 |      */
131 |     public BookClass getBookClass() {
132 |         return bookClass;
133 |     }
134 | 
135 |     public void setBookClass(BookClass bookClass) {
136 |         this.bookClass = bookClass;
137 |     }
138 | 
139 |     /**
140 |      * 获取书本所在末级分类
141 |      *
142 |      * @return 字符串描述所属分类，最末层的分类，用&gt;分割层级，
143 |      * 例如“数理科学和化学图书馆&gt;数学&gt;总论复分&gt;总论”
144 |      */
145 |     public String getDetailBookClass() {
146 |         return detailBookClass;
147 |     }
148 | 
149 |     public void setDetailBookClass(String detailBookClass) {
150 |         this.detailBookClass = detailBookClass;
151 |     }
152 | 
153 |     private String publishDate;
154 |     private String theme;
155 |     /**
156 |      * 所属分类
157 |      */
158 |     private BookClass bookClass = new RootBookClass();
159 |     /**
160 |      * 所属分类的中文描述。
161 |      * “>”分割层级，
162 |      * 例如“数理科学和化学图书馆>数学>总论复分>总论”
163 |      */
164 |     private String detailBookClass;
165 | 
166 |     public String getCookie() {
167 |         return cookie;
168 |     }
169 | 
170 |     void setCookie(String cookie) {
171 |         this.cookie = cookie;
172 |     }
173 | 
174 |     private String cookie;
175 | 
176 |     /**
177 |      * 初始化一个新创建的{@code Book}对象。需要{@code Book}的所有属性。
178 |      * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例，
179 |      * 或者使用{@link com.njulib.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。
180 |      *
181 |      * @param id              {@code Book}的id。该id是服务器命名的
182 |      * @param name            书名
183 |      * @param author          作者
184 |      * @param publishDate     出版日期
185 |      * @param theme           主题词
186 |      * @param bookClass       书本分类
187 |      * @param detailBookClass 书本分类分类名路径
188 |      */
189 |     public Book(String id, String name, String author, String publishDate, String theme, BookClass bookClass, String detailBookClass) {
190 |         this.id = id;
191 |         this.name = name;
192 |         this.author = author;
193 |         this.publishDate = publishDate;
194 |         this.theme = theme;
195 |         this.bookClass = bookClass;
196 |         this.detailBookClass = detailBookClass;
197 |     }
198 | 
199 | 
200 |     /**
201 |      * 通过在线阅览的地址来获取{@code Book}对象
202 |      *
203 |      * @param onlineReadUrl 书本的在线阅读地址
204 |      * @return Book对象，仅指定了id
205 |      */
206 |     public static Book getBookFromUrl(String onlineReadUrl) {
207 |         for (String para : onlineReadUrl.split("&")) {
208 |             if (para.startsWith("ssnumber=")) {
209 |                 Book book = new Book(para.substring(9, para.length()));
210 |                 book.fillBookInfoByUrl(onlineReadUrl);
211 |                 return book;
212 |             }
213 |         }
214 |         return null;
215 |     }
216 | 
217 |     /**
218 |      * 通过在线阅读页面补全{@code Book}的信息
219 |      * 仅可补全{@link #name},{@link #id},{@link #author},{@link  #publishDate}
220 |      *
221 |      * @param url 书本的在线阅读页面
222 |      */
223 |     public void fillBookInfoByUrl(String url) {
224 |         try {
225 |             String html = new BookDownloader(this).getBookViewPageHtml(url);
226 |             html = html.replaceAll("<!--", "<");
227 |             html = html.replaceAll("-->", "");
228 |             Document doc = Jsoup.parse(html);
229 |             Elements nameNode = doc.getElementsByTag("title");
230 |             this.name = nameNode.text();
231 |             Elements infoNode = doc.getElementsByTag("span").not("[style]");
232 |             for (Element node : infoNode) {
233 |                 if (node.text().startsWith("作者：")) {
234 |                     this.author = node.text().substring(3, node.text().length());
235 |                 }
236 |                 if (node.text().startsWith("出版日期：")) {
237 |                     this.publishDate = node.text().substring(5, node.text().length());
238 |                 }
239 |             }
240 |         } catch (BookDLException e) {
241 |             e.printStackTrace();
242 |         }
243 |     }
244 | 
245 |     /**
246 |      * 获取书本的在线阅读地址。
247 |      *
248 |      * @return 书本在线与阅读的URL
249 |      * @throws IOException IO错误
250 |      */
251 |     public String getbookread() throws IOException {
252 |         resetCookie();
253 |         String para = "BID=" + id + "&ReadMode=0&pdfread=0&displaystyle=0";
254 |         String Url = NJULib.baseUrl + "/getbookread?" + para;
255 |         String result = MyHttpRequest.getWithCookie(Url, null, cookie, "UTF-8", 1000);
256 |         return NJULib.baseUrl + URLDecoder.decode(result, "UTF-8");
257 |     }
258 | 
259 |     /**
260 |      * 重置{@link #cookie}
261 |      *
262 |      * @throws IOException 重置cookie失败
263 |      */
264 |     private void resetCookie() throws IOException {
265 |         cookie = (cookie == null) ? NJULib.getSession() : cookie;
266 |     }
267 | 
268 |     @Override
269 |     public String toString() {
270 |         return "Book{" +
271 |                 "id='" + id + '\'' +
272 |                 ", name='" + name + '\'' +
273 |                 ", author='" + author + '\'' +
274 |                 ", publishDate='" + publishDate + '\'' +
275 |                 ", theme='" + theme + '\'' +
276 |                 ", bookClass='" + bookClass.getPath() + '\'' +
277 |                 ", detailBookClass='" + detailBookClass + '\'' +
278 |                 '}';
279 |     }
280 | 
281 |     /**
282 |      * 下载该书。将下载许多图片，书的每一页都是一张png图片。
283 |      * 将会在{@code pathname}下创建一个以书名命名的文件夹，并存储所有图片。
284 |      * 错误日志将在当前路径下名为"error.log"
285 |      */
286 |     public void download() {
287 |         BookDownloader bookDownloader = new BookDownloader(this);
288 |         bookDownloader.downloadAllImages();
289 |     }
290 | 
291 |     /**
292 |      * 下载该书。将下载许多图片，书的每一页都是一张png图片。
293 |      * 将会在{@code pathname}下创建一个以书名命名的文件夹，并存储所有图片。
294 |      * 错误日志将在当前路径下名为"error.log"
295 |      *
296 |      * @param pathname     下载存储目录
297 |      * @param threadNumber 下载线程数
298 |      */
299 |     public void download(String pathname, int threadNumber) {
300 |         BookDownloader bookDownloader = new BookDownloader(this);
301 |         bookDownloader.setSavePath(pathname);
302 |         bookDownloader.setThreadNumber(threadNumber);
303 |         bookDownloader.downloadAllImages();
304 |     }
305 | 
306 |     /**
307 |      * 下载该书。将下载许多图片，书的每一页都是一张png图片。
308 |      * 将会在{@code pathname}下创建一个以书名命名的文件夹，并存储所有图片。
309 |      *
310 |      * @param pathname     下载存储目录
311 |      * @param threadNumber 线程数
312 |      * @param errorLogPath 错误日志路径
313 |      */
314 |     public void download(String pathname, int threadNumber, String errorLogPath) {
315 |         BookDownloader bookDownloader = new BookDownloader(this);
316 |         bookDownloader.setSavePath(pathname);
317 |         bookDownloader.setThreadNumber(threadNumber);
318 |         bookDownloader.setErrorLogPath(errorLogPath);
319 |         bookDownloader.downloadAllImages();
320 |     }
321 | 
322 |     @Override
323 |     public int hashCode() {
324 |         return Integer.parseInt(this.getId());
325 |     }
326 | 
327 |     @Override
328 |     public boolean equals(Object obj) {
329 |         if (!(obj instanceof Book))
330 |             return false;
331 |         if (obj == this)
332 |             return true;
333 |         return this.id.equals(((Book) obj).id);
334 |     }
335 | }
336 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/BookClass.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.object;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.nodes.Document;
  5 | import org.jsoup.nodes.Element;
  6 | import org.jsoup.select.Elements;
  7 | import com.njulib.spider.BookDownloader;
  8 | import com.njulib.spider.NJULib;
  9 | import utils.conversion.MyDecoder;
 10 | import utils.network.MyHttpRequest;
 11 | 
 12 | import java.io.IOException;
 13 | import java.nio.file.Paths;
 14 | import java.util.*;
 15 | import java.util.concurrent.atomic.AtomicInteger;
 16 | import java.util.regex.Matcher;
 17 | import java.util.regex.Pattern;
 18 | import java.util.stream.Collectors;
 19 | 
 20 | /**
 21 |  * 图书的分类。
 22 |  * <p>
 23 |  * 对应<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a> 中的图书。
 24 |  * 同时分类名和分类编号满足中图法分类。是树结构。具有查询子分类和查询分类下书籍列表，批量下载分类书籍等功能。
 25 |  * 如果你没有足够信息构造实例，可以通过{@link RootBookClass}查询所有分类来获取实例。
 26 |  * 或者{@link com.njulib.spider.BookSearch}中的一些方法获取实例。
 27 |  *
 28 |  * @author padeoe
 29 |  * @Date: 2016/12/08
 30 |  */
 31 | public class BookClass {
    /**
     * Category id, the Chinese Library Classification id as defined by the server,
     * e.g. "0T0P3010".
     */
    private String id;
    /**
     * Category name.
     */
    private String name;
    /**
     * Parent category.
     */
    private BookClass parent;
    /**
     * Child categories. {@link #addChild(BookClass)} registers each child under its id
     * and, when it has one, under its name as well.
     */
    private Map<String, BookClass> children;


    /**
     * Whether the child categories {@link #children} have already been loaded.
     */
    private boolean isLoaded = false;
 55 | 
 56 |     /**
 57 |      * 查看当对象所使用的cookie
 58 |      *
 59 |      * @return cookie
 60 |      */
 61 |     public String getCookie() {
 62 |         return cookie;
 63 |     }
 64 | 
 65 |     /**
 66 |      * 设置{@code cookie},BookClass的子每一次子分类加载，
 67 |      * 书籍查询等操作都需要cookie，设置的cookie将会对所有子分类使用，
 68 |      * 以避免频繁获取cookie
 69 |      *
 70 |      * @param cookie cookie
 71 |      */
 72 |     public void setCookie(String cookie) {
 73 |         this.cookie = cookie;
 74 |     }
 75 | 
    /**
     * Cookie sent to the server when querying category information; initially {@code null}.
     * It is initialized when a method that needs the network is called.
     * The children created by {@link #loadChild()} receive the same cookie as this {@link BookClass}.
     */
    private String cookie;
 82 | 
 83 |     /**
 84 |      * 获取子分类的数量
 85 |      *
 86 |      * @return 子分类的数量
 87 |      */
 88 |     public int getChildCount() {
 89 |         return children.size();
 90 |     }
 91 | 
 92 |     /**
 93 |      * 获取父分类
 94 |      *
 95 |      * @return 父分类。如果不存在则为null
 96 |      */
 97 |     public BookClass getParent() {
 98 |         return parent;
 99 |     }
100 | 
101 | 
102 |     /**
103 |      * 获取所有子分类。
104 |      * 初始为null,若要查看子分类，必须先调用{@link #loadChild()}或者{@link #loadAllChild()}从服务器查询并加载
105 |      *
106 |      * @return 子分类的集合
107 |      */
108 |     public Set<BookClass> getChildren() {
109 |         return children.values().stream().collect(Collectors.toSet());
110 |     }
111 | 
112 |     /**
113 |      * 查询特定子分类。
114 |      *
115 |      * @param idOrName 子分类的名称或者代号。符合中图法分类。
116 |      * @return 子分类
117 |      */
118 |     public BookClass getChild(String idOrName) {
119 |         return children.get(idOrName);
120 |     }
121 | 
122 |     public String getName() {
123 |         return name;
124 |     }
125 | 
126 |     public void setName(String name) {
127 |         this.name = name;
128 |     }
129 | 
130 |     public String getId() {
131 |         return id;
132 |     }
133 | 
134 |     public void setId(String id) {
135 |         this.id = id;
136 |     }
137 | 
138 |     public void setParent(BookClass parent) {
139 |         this.parent = parent;
140 |     }
141 | 
    /**
     * Tells whether this is a terminal (last-level) category.
     * Always {@code false} for this class.
     * NOTE(review): {@code TerminalBookClass} presumably overrides this to return {@code true} — confirm.
     *
     * @return {@code false}
     */
    public boolean isTerminal() {
        return false;
    }
145 | 
146 |     /**
147 |      * 添加一个子分类
148 |      *
149 |      * @param bookClass 子分类
150 |      * @return 如果同id的子分类已存在，则返回之前的子分类，如果不存在，则添加并返回null
151 |      */
152 |     public BookClass addChild(BookClass bookClass) {
153 |         if (bookClass.name != null) {
154 |             children.putIfAbsent(bookClass.name, bookClass);
155 |         }
156 |         return children.putIfAbsent(bookClass.id, bookClass);
157 |     }
158 | 
159 |     /**
160 |      * 创建并初始化一个书本分类。指定分类编号，分类名称和父分类。
161 |      *
162 |      * @param id     分类编号
163 |      * @param name   分类名称
164 |      * @param parent 父分类
165 |      */
166 |     public BookClass(String id, String name, BookClass parent) {
167 |         this.id = id;
168 |         this.name = name;
169 |         this.parent = parent;
170 |         children = new HashMap<>();
171 |     }
172 | 
173 |     /**
174 |      * 创建一个新初始化的{@code BookClass}对象，
175 |      * 使之中图法分类标识是{@code id}
176 |      *
177 |      * @param id 分类的中图法分类标识。
178 |      *           需要和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>定义的格式一致
179 |      */
180 |     public BookClass(String id) {
181 |         this.id = id;
182 |         children = new HashMap<>();
183 |         this.isLoaded = false;
184 |     }
185 | 
186 |     /**
187 |      * 加载子分类。仅加载一层子分类，即子分类的子分类不会被加载。
188 |      * 当该方法被调用时，会向服务器查询该分类的子分类并更新该对象的{@link #children}
189 |      * <p>
190 |      * 如需递归加载子分类，调用{@link #loadAllChild()}
191 |      *
192 |      * @throws IOException 从服务器查询子节点出错
193 |      */
194 |     public void loadChild() throws IOException {
195 |         if (!isTerminal()) {
196 |             checkCookie();
197 |             String Url = NJULib.baseUrl + "/classifyview";
198 |             String data = "fenlei=" + this.getId() + "&lib=markbook";
199 |             String result = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "UTF-8", 1000);
200 |             // System.out.println(result);
201 |             Document doc = Jsoup.parse(result);
202 |             Elements li = doc.getElementsByTag("li");
203 |             for (Element bookClassId : li) {
204 |                 String id = bookClassId.attr("id");
205 |                 String name = bookClassId.getElementsByTag("a").text();
206 |                 boolean hasSubTree = bookClassId.getElementsByTag("img").attr("onClick").contains("getSubTree");
207 |                 //System.out.println(id+" "+NJULib.decodeUrlUnicode(name));
208 |                 BookClass child = hasSubTree ? new BookClass(id, MyDecoder.decodeUrlUnicode(name), this) :
209 |                         new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(name), this);
210 |                 child.setCookie(cookie);
211 |                 this.addChild(child);
212 |             }
213 |             this.isLoaded = true;
214 |         }
215 |     }
216 | 
217 | 
218 |     /**
219 |      * 迭代加载所有子分类。
220 |      * 直至加载到每个分类的末层分类。
221 |      *
222 |      * @throws IOException 从服务器查询时出错
223 |      */
224 |     public void loadAllChild() throws IOException {
225 |         if (!isTerminal()) {
226 |             loadChild();
227 |             for (BookClass child : getChildren()) {
228 |                 child.loadAllChild();
229 |             }
230 |         }
231 |     }
232 | 
233 | 
234 |     /**
235 |      * 下载分类下所有图书，会迭代测创建分类文件夹
236 |      *
237 |      * @param pathname     存储路径。将在该路径下创建多级分类目录并保存下载的图书
238 |      * @param threadNumber 线程数
239 |      * @param errorLogPath 错误日志路径
240 |      * @throws IOException 连接失败的错误
241 |      */
242 |     public void downloadWithCataDir(String pathname, int threadNumber, String errorLogPath) throws IOException {
243 |         if (!isTerminal()) {
244 |             loadChild();
245 |             for (BookClass child : getChildren()) {
246 |                 child.downloadWithCataDir(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath);
247 |             }
248 |         } else {
249 |             downloadAllBooks(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath);
250 |         }
251 |     }
252 | 
253 |     /**
254 |      * 下载分类下所有图书，会迭代测创建分类文件夹
255 |      * 下载存储路径为当前路径，线程数为5，错误日志将保存在当前路径，文件名为{@link BookDownloader#ERROR_LOG_NAME}
256 |      * 可以调用重载{@link #downloadWithCataDir(String, int, String)}设置参数
257 |      *
258 |      * @throws IOException 连接失败的错误
259 |      */
260 |     public void downloadWithCataDir() throws IOException {
261 |         downloadWithCataDir(System.getProperty("user.dir"), 5, Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString());
262 |     }
263 | 
264 |     /**
265 |      * 从服务器获取该分类下图书列表的第{@code page}页。
266 |      * 图书列表的分页是服务器做的，每页最多10条图书。
267 |      * <p>
268 |      * 页数的最大值可以根据{@link #queryBooksSize()}自行计算
269 |      *
270 |      * @param page 图书列表的页码
271 |      * @return 列表该页记录的图书
272 |      * @throws IOException 从服务器查询书本列表时出错
273 |      */
274 |     public Set<Book> queryBooks(int page) throws IOException {
275 |         checkCookie();
276 |         String data = "fenlei=" + this.id + "&mark=all&Page=" + page + "&totalnumber=-1";
277 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
278 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
279 |         //   System.out.println(html);
280 |         Set<Book> books = queryBooks(html);
281 |         return books;
282 | 
283 |     }
284 | 
285 |     /**
286 |      * 获得某分类下的所有图书
287 |      *
288 |      * @return 分类下所有图书
289 |      * @throws IOException 从服务器查询书本列表时出错
290 |      */
291 |     public Set<Book> queryAllBooks() throws IOException {
292 |         return queryAllBooks(5);
293 |     }
294 | 
295 |     /**
296 |      * 获得分类下的所有图书
297 |      *
298 |      * @param threadNumber 线程数
299 |      * @return 图书集合
300 |      * @throws IOException 连接错误
301 |      */
302 |     public Set<Book> queryAllBooks(int threadNumber) throws IOException {
303 |         checkCookie();
304 |         String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1";
305 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
306 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
307 |         //   System.out.println(html);
308 |         Document doc = Jsoup.parse(html);
309 |         Elements form = doc.select("a:contains(末页)");
310 | 
311 |         if (!form.isEmpty()) {
312 |             String keyword = form.get(0).attr("href");
313 |             String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1);
314 |             int size = Integer.parseInt(booksize);
315 |             System.out.println("一共 " + size + " 本书");
316 |             Set<Book> books = queryBooks(html);
317 |             List<PageGetThread> threadList = new ArrayList<>();
318 | 
319 |             AtomicInteger needGettedPage = new AtomicInteger(2);//需要获取的页码
320 |             int lastPage = size / 10 + 1;//最后一页的页码
321 |             //开始多线程刷所有页码
322 |             for (int threadN = 0; threadN < threadNumber; threadN++) {
323 |                 threadList.add(new PageGetThread(needGettedPage, lastPage));
324 |             }
325 | 
326 |             for (PageGetThread thread : threadList) {
327 |                 thread.start();
328 |             }
329 |             for (PageGetThread thread : threadList) {
330 |                 try {
331 |                     thread.join();
332 |                 } catch (InterruptedException e) {
333 |                     e.printStackTrace();
334 |                 }
335 |             }
336 |             threadList.forEach(pageGetThread -> books.addAll(pageGetThread.getThreadBooks()));
337 |             return books;
338 |         }
339 |         return null;
340 |     }
341 | 
342 |     /**
343 |      * 下载分类下所有图书。
344 |      * 所有书籍将直接保存在{@code pathname}目录下，每本书一个文件夹，以书名命名。如同名，则加作者名，如又同名，加书本编号
345 |      *
346 |      * @param pathname     存储路径。书本文件夹所在的上级路径
347 |      * @param threadNumber 线程数
348 |      * @param errorLogPath 错误日志路径
349 |      * @throws IOException 连接失败的错误
350 |      */
351 |     public void downloadAllBooks(String pathname, int threadNumber, String errorLogPath) throws IOException {
352 |         checkCookie();
353 |         String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1";
354 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
355 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
356 |         //   System.out.println(html);
357 |         Document doc = Jsoup.parse(html);
358 |         Elements form = doc.select("a:contains(末页)");
359 |         if (!form.isEmpty()) {
360 |             String keyword = form.get(0).attr("href");
361 |             String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1);
362 |             int size = Integer.parseInt(booksize);
363 |             System.out.println(this.getPath()+"一共 " + size + " 本书");
364 |             Set<Book> books = queryBooks(html);
365 |             Set<Book> downloading;
366 |             downloadBooks(books, pathname, threadNumber, errorLogPath);
367 |             int lastPage = size / 10 + 1;//最后一页的页码
368 |             int index = 1;
369 |             for (int i = lastPage; i >= 2; i--) {
370 |                 downloading = queryBooks(i);
371 |                 for (Book book : downloading) {
372 |                     if (books.add(book)) {
373 |                         book.download(pathname, threadNumber, errorLogPath);
374 |                         index++;
375 |                     } else {
376 |                         System.out.println("服务器返回了重复书籍，跳过 " + book);
377 |                     }
378 |                 }
379 |             }
380 |             System.out.println("去重后共" + books.size() + "书，实际下载了" + (index + 10) + "本书(含失败)");
381 |         }
382 |     }
383 | 
384 |     private void downloadBooks(Set<Book> books, String pathname, int threadNumber, String errorLogPath) {
385 |         for (Book book : books) {
386 |             book.download(pathname, threadNumber, errorLogPath);
387 |         }
388 |     }
389 | 
390 | 
391 |     /**
392 |      * 获取所有图书列表的线程
393 |      */
394 |     class PageGetThread extends Thread {
395 |         Set<Book> books = new HashSet<>();
396 |         AtomicInteger needGettedPage;
397 |         int lastPage;
398 | 
399 |         public PageGetThread(AtomicInteger needGettedPage, int lastPage) {
400 |             this.needGettedPage = needGettedPage;
401 |             this.lastPage = lastPage;
402 |         }
403 | 
404 |         @Override
405 |         public void run() {
406 |             while (true) {
407 |                 int gettingpage = needGettedPage.getAndIncrement();
408 |                 if (gettingpage <= lastPage) {
409 |                     try {
410 |                         if (gettingpage % 10 == 0) {
411 |                             resetCookie();
412 |                         }
413 |                         books.addAll(queryBooks(gettingpage));
414 |                     } catch (IOException e) {
415 |                         e.printStackTrace();
416 |                     }
417 |                 } else {
418 |                     break;
419 |                 }
420 |             }
421 |         }
422 | 
423 |         public Set<Book> getThreadBooks() {
424 |             return books;
425 |         }
426 |     }
427 | 
428 | 
429 |     /**
430 |      * 获取HTML文本中的书籍并根据其分类添加进当前的分类结构
431 |      *
432 |      * @param html 服务器特定页面返回的包含书本信息的HTML文本。
433 |      *             服务器多个不同页面返回的包含书本信息的HTML中书本信息相关节点的格式都相似。均可调用本函数
434 |      * @return HTML中记录的书本
435 |      */
436 |     public Set<Book> queryBooks(String html) {
437 |         Document doc = Jsoup.parse(html);
438 |         Elements booksliNode = doc.select("li[style]");
439 |         return queryBooks(booksliNode);
440 |     }
441 | 
442 |     private Set<Book> queryBooks(Elements booksliNode) {
443 |         Set<Book> books = new HashSet<>();
444 |         for (Element element : booksliNode) {
445 |             //获取书名和id
446 |             String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
447 |             BookClass bookBookClass;
448 |             Elements nameIdNode = element.select("p[class=name]");
449 |             if (nameIdNode != null) {
450 |                 name = nameIdNode.text();
451 |                 Elements idNode = nameIdNode.select("a[onclick]");
452 |                 if (idNode != null && idNode.size() > 0) {
453 |                     String idOnClick = idNode.get(0).attr("onclick");
454 |                     int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
455 |                     if (start != 0 && end != -1) {
456 |                         id = idOnClick.substring(start, end);
457 |                     }
458 |                 }
459 |             }
460 |             //获取分类
461 |             BookClass[] bookClasses = new BookClass[0];
462 |             Elements infoNode = element.select("p[class=info]");
463 |             if (infoNode != null) {
464 |                 Elements bookInfos = infoNode.select("a");
465 |                 if (bookInfos != null && bookInfos.size() > 0) {
466 |                     Element terminalCataNode = bookInfos.last();
467 |                     bookInfos.remove(terminalCataNode);
468 |                     List<BookClass> tmplist = bookInfos.stream()
469 |                             .map(bookInfo -> getBookCata(bookInfo, false))
470 |                             .filter(Objects::nonNull)
471 |                             .collect(Collectors.toList());
472 |                     BookClass terminalBookClass = getBookCata(terminalCataNode, true);
473 |                     if (terminalBookClass != null) {
474 |                         tmplist.add(terminalBookClass);
475 |                     }
476 |                     bookClasses = tmplist.toArray(bookClasses);
477 |                 }
478 |             }
479 |             bookBookClass = this.link(bookClasses);
480 | 
481 |             //获取作者，出版日期，主题词，分类
482 |             String info = element.text();
483 |             Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
484 |             Matcher matcher = pattern.matcher(info);
485 |             while (matcher.find()) {
486 |                 name = matcher.group(1);
487 |                 author = matcher.group(2);
488 |                 publishDate = matcher.group(3);
489 |                 theme = matcher.group(4);
490 |                 detailBookClass = matcher.group(5);
491 |             }
492 |             Pattern minPattern = Pattern.compile(".*(《.*》).*");
493 |             Matcher minMatcher = minPattern.matcher(info);
494 |             while (minMatcher.find()) {
495 |                 name = minMatcher.group(1);
496 |             }
497 | 
498 |             //汇总书本
499 |             if (name != null && id != null) {
500 |                 Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
501 |                 book.setCookie(cookie);
502 |                 books.add(book);
503 |                 if (bookBookClass.isTerminal()) {
504 |                     ((TerminalBookClass) bookBookClass).addBook(book);
505 |                 } else {
506 |                     System.out.println("未获取到分类信息，将不被归档 " + book);
507 |                 }
508 |             } else {
509 |                 System.out.println("error: " + info);
510 |             }
511 |         }
512 |         return books;
513 |     }
514 | 
515 | 
516 |     /**
517 |      * 通过HTML中对应节点获取到书所在分类
518 |      *
519 |      * @param bookInfo   书本信息的HTML节点
520 |      * @param isTerminal 是否是终端分类
521 |      * @return 书所在分类。如果是终端分类将会返回{@code TerminalBookClass}
522 |      */
523 |     private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
524 |         String cataName = bookInfo.text();
525 |         String href = bookInfo.attr("href");
526 |         if (href != null) {
527 |             int cataIdStart = href.indexOf('=') + 1;
528 |             if (cataIdStart != 0) {
529 |                 String cataId = href.substring(href.indexOf('=') + 1, href.length());
530 |                 BookClass tmp = isTerminal ? new TerminalBookClass(cataId) : new BookClass(cataId);
531 |                 tmp.setName(cataName);
532 |                 return tmp;
533 |             }
534 | 
535 |         }
536 |         return null;
537 |     }
538 | 
539 | 
540 |     /**
541 |      * 从服务器查询当前分类下图书的数量。包含所有子分类下的图书
542 |      *
543 |      * @return 当前分类下图书的数量
544 |      * @throws IOException 查询失败
545 |      */
546 |     public int queryBooksSize() throws IOException {
547 |         checkCookie();
548 |         String data = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1";
549 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
550 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
551 |         // System.out.println(html);
552 |         Document doc = Jsoup.parse(html);
553 |         Elements form = doc.select("input[name=totalnumber]");
554 |         if (!form.isEmpty()) {
555 |             String booksize = form.get(0).attr("value");
556 |             return Integer.parseInt(booksize);
557 |         }
558 |         return 0;
559 |     }
560 | 
561 | 
562 |     /**
563 |      * 检查{@code cookie}如果为null将会更新cookie
564 |      *
565 |      * @throws IOException 更新cookie失败
566 |      */
567 |     private void checkCookie() throws IOException {
568 |         cookie = (cookie == null) ? NJULib.getSession() : cookie;
569 |     }
570 | 
571 |     /**
572 |      * 重置{@code cookie}
573 |      *
574 |      * @throws IOException 重置cookie失败
575 |      */
576 |     private void resetCookie() throws IOException {
577 |         cookie = NJULib.getSession();
578 |     }
579 | 
580 |     /**
581 |      * 对当前分类添加子分类
582 |      *
583 |      * @param childBookClasses 顺次路径关系子分类，后一个是前一个的子分类。第一个是当前分类的子分类
584 |      * @return 子分类的最后一级分类.若子路径参数为空，则为当前分类
585 |      */
586 |     public BookClass link(BookClass... childBookClasses) {
587 |         BookClass currentBookClass = this;
588 |         for (BookClass bookClass : childBookClasses) {
589 |             BookClass previois = currentBookClass.addChild(bookClass);
590 |             if (previois != null) {
591 |                 currentBookClass = previois;
592 |             } else {
593 |                 bookClass.parent = currentBookClass;
594 |                 currentBookClass = bookClass;
595 |             }
596 |         }
597 |         return currentBookClass;
598 |     }
599 | 
600 |     /**
601 |      * 获取分类对象所有终端分类下已存储的书籍
602 |      * <p>
603 |      * 不会触发网络请求，只是迭代收集子分类的下已存在的书籍。
604 |      * 如要即时从服务器查询书籍，请调用{@link #queryAllBooks()}及其重载
605 |      *
606 |      * @return 该分类下属所有分类的图书集合
607 |      */
608 |     public Set<Book> getBooks() {
609 |         return this.getChildren().stream().map(BookClass::getBooks).collect(HashSet::new, Set::addAll, Set::addAll);
610 |     }
611 | 
612 | 
613 |     /**
614 |      * 判断两个{@code BookClass}是否是同一个分类。
615 |      * 仅根据代号即{@link BookClass#id}来判断
616 |      *
617 |      * @param obj 任意对象
618 |      * @return 对象是否是同一个分类
619 |      */
620 |     @Override
621 |     public boolean equals(Object obj) {
622 |         if (!(obj instanceof BookClass))
623 |             return false;
624 |         if (obj == this)
625 |             return true;
626 |         return this.id.equals(((BookClass) obj).id);
627 |     }
628 | 
629 |     /**
630 |      * 获取分类所在的路径。
631 |      * 返回可读的{@code String}，对二级分类到当前分类顺次所经路径分别调用{@link BookClass#toString()}，用"-"分割
632 |      *
633 |      * @return 从二级分类到当前分类顺次所经路径，用"-"分隔分类
634 |      */
635 |     public String getPath() {
636 |         Stack<BookClass> parents = new Stack<>();
637 |         BookClass bookClass = this;
638 |         while (bookClass!=null&&!bookClass.isRoot()) {
639 |             parents.push(bookClass);
640 |             bookClass = bookClass.getParent();
641 |         }
642 |         StringBuilder sb = new StringBuilder();
643 |         if (!parents.isEmpty()) {
644 |             sb.append(parents.pop().toString());
645 |         }
646 |         while (!parents.isEmpty()) {
647 |             sb.append("-");
648 |             sb.append(parents.pop().toString());
649 |         }
650 |         return sb.toString();
651 |     }
652 | 
653 |     /**
654 |      * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例
655 |      *
656 |      * @return 是否是根分类
657 |      */
658 |     public boolean isRoot() {
659 |         return false;
660 |     }
661 | 
662 |     /**
663 |      * 返回{@code BookClass}的哈希值。
664 |      * 会直接使用用{@link #id}的哈希值
665 |      *
666 |      * @return 哈希值
667 |      */
668 |     @Override
669 |     public int hashCode() {
670 |         return id.hashCode();
671 |     }
672 | 
673 |     /**
674 |      * 返回{@code BookClass}的可读字符串描述。
675 |      *
676 |      * @return 格式是 "分类代号(分类名)"，如果分类名为null，则格式是"分类代号"
677 |      */
678 |     @Override
679 |     public String toString() {
680 |         return this.getId() + (this.getName() == null ? "" : "(" + this.getName() + ")");
681 |     }
682 | }
683 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/Books.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | /**
 6 |  * 书本查询的结果。{@link com.njulib.spider.BookSearch}类某些方法的返回值用到本类
 7 |  * 包含了查询出的图书当前页集合，以及查询结果的总页数，书本总数。
 8 |  *
 9 |  * @author padeoe
10 |  * @Date: 2016/12/09
11 |  */
12 | public class Books {
13 |     private int page;
14 |     private int totalNums;
15 |     private int totalPage;
16 |     private Set<Book> bookSet;
17 | 
18 |     /**
19 |      * @param page      当前页数
20 |      * @param totalPage 总页数
21 |      * @param totalNums 总书本数
22 |      * @param bookSet   本页的书
23 |      */
24 |     public Books(int page, int totalPage, int totalNums, Set<Book> bookSet) {
25 |         this.totalPage = totalPage;
26 |         this.bookSet = bookSet;
27 |     }
28 | 
29 |     /**
30 |      * 获取查询到的图书总数
31 |      *
32 |      * @return 查询到的图书总数
33 |      */
34 |     public int getTotalNums() {
35 |         return totalNums;
36 |     }
37 | 
38 |     public int getPage() {
39 |         return page;
40 |     }
41 | 
42 |     public int getTotalPage() {
43 |         return totalPage;
44 |     }
45 | 
46 |     public Set<Book> getBookSet() {
47 |         return bookSet;
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/InfoReader.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object;
 2 | 
 3 | import com.njulib.spider.BookDownloader;
 4 | 
 5 | import java.io.IOException;
 6 | import java.nio.charset.Charset;
 7 | import java.nio.file.Files;
 8 | import java.nio.file.Paths;
 9 | import java.util.List;
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 | 
13 | /**
14 |  * info文件解析器。
15 |  * <p>
16 |  * info文件是由{@link BookDownloader}在下载过程中创建的文本文件。
17 |  * 记录了一个{@link Book#toString()}
18 |  * 默认名称是{@link BookDownloader#INFO_FILE_NAME}。
19 |  * 该类会读取info文件并解析出{@link Book}对象
20 |  *
21 |  * @author padeoe
22 |  * @Date: 2016/12/11
23 |  */
24 | public class InfoReader {
25 |     private String infoFilePath;
26 | 
27 |     public InfoReader(String infoFilePath) {
28 |         this.infoFilePath = infoFilePath;
29 |     }
30 | 
31 |     /**
32 |      * 解析{@code Book}对象，如果未找到返回null
33 |      *
34 |      * @return {@code Book}对象
35 |      */
36 |     public Book read() {
37 |         try {
38 |             List<String> lines = Files.readAllLines(Paths.get(infoFilePath), Charset.forName("UTF-8") );
39 |             String info = "";
40 |             if (lines.size() > 0) {
41 |                 info = lines.get(0);
42 |             }
43 |             Pattern pattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass=(.*), detailBookClass='(.*)'\\}");
44 |             Matcher matcher = pattern.matcher(info);
45 |             if (matcher.find()) {
46 |                 return new Book(matcher.group(1),
47 |                         matcher.group(2),
48 |                         matcher.group(3),
49 |                         matcher.group(4),
50 |                         matcher.group(5),
51 |                         new BookClass(matcher.group(6)),
52 |                         matcher.group(7));
53 |             }
54 |             return null;
55 |         } catch (IOException e) {
56 |             e.printStackTrace();
57 |             return null;
58 |         }
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/RootBookClass.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object;
 2 | 
 3 | /**
 4 |  * 根分类
 5 |  * <p>
 6 |  * 根分类是在中图法分类之外虚拟出的分类。
 7 |  * 用于集合管理所有子分类，以及作为起点，从服务器获取子分类。
 8 |  *
 9 |  * @author padeoe
10 |  * @Date: 2016/12/20
11 |  */
12 | public class RootBookClass extends BookClass {
13 |     public RootBookClass() {
14 |         super("all");
15 |     }
16 | 
17 |     /**
18 |      * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例
19 |      *
20 |      * @return true
21 |      */
22 |     @Override
23 |     public boolean isRoot() {
24 |         return true;
25 |     }
26 | 
27 |     /**
28 |      * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例
29 |      *
30 |      * @return false
31 |      */
32 |     @Override
33 |     public boolean isTerminal() {
34 |         return false;
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/TerminalBookClass.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object;
 2 | 
 3 | import java.util.HashSet;
 4 | import java.util.Set;
 5 | 
 6 | /**
 7 |  * 终端分类。即分类的最末层。
 8 |  * <p>
 9 |  * 采用的是中图法分类，例如"哲学宗教-哲学理论-辩证唯物主义-总论"的最后一个"总论"就是一个终端分类。
10 |  * 只有终端分类下可以存储图书。
11 |  *
12 |  * @author padeoe
13 |  * @Date: 2016/12/20
14 |  */
15 | public class TerminalBookClass extends BookClass {
16 |     private Set<Book> books = new HashSet<>();
17 | 
18 |     /**
19 |      * 创建一个新初始化的{@code BookClass}对象，
20 |      * 使之中图法分类标识是{@code id}
21 |      *
22 |      * @param id 分类的中图法分类标识。
23 |      *           需要和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>定义的格式一致
24 |      */
25 |     public TerminalBookClass(String id) {
26 |         super(id);
27 |     }
28 | 
29 | 
30 |     /**
31 |      * 构造函数。
32 |      *
33 |      * @param id     分类编号
34 |      * @param name   分类名
35 |      * @param parent 父分类
36 |      */
37 |     public TerminalBookClass(String id, String name, BookClass parent) {
38 |         super(id, name, parent);
39 |     }
40 | 
41 |     /**
42 |      * 获取分类下的书籍
43 |      * 该方法只是返回该分类下现有书籍，不会向服务器查询该分类下所有图书。
44 |      * 如需向服务器查询，请调用{@link BookClass#queryAllBooks()}及其重载方法
45 |      *
46 |      * @return 分类下的书籍。
47 |      */
48 |     public Set<Book> getBooks() {
49 |         return books;
50 |     }
51 | 
52 |     /**
53 |      * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例
54 |      *
55 |      * @return true
56 |      */
57 |     @Override
58 |     public boolean isTerminal() {
59 |         return true;
60 |     }
61 | 
62 | 
63 |     /**
64 |      * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例
65 |      *
66 |      * @return false
67 |      */
68 |     @Override
69 |     public boolean isRoot() {
70 |         return false;
71 |     }
72 | 
73 |     /**
74 |      * 增加分类下图书
75 |      *
76 |      * @param book 图书
77 |      * @return 如果分类下已有该图书，将返回false。如果没有，将添加并返回true
78 |      */
79 |     public boolean addBook(Book book) {
80 |         return books.add(book);
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/exception/BookDLException.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object.exception;
 2 | 
 3 | import com.njulib.object.Book;
 4 | 
 5 | /**
 6 |  * 下载某一本书时发生错误。此异常发生在该书对应的文件夹创建之前。因此此书没有任何文件被下载。
 7 |  *
 8 |  * @author padeoe
 9 |  *         Date: 2016/12/12
10 |  */
11 | public class BookDLException extends Exception {
12 |     /**
13 |      * 发生下载错误的书籍
14 |      */
15 |     private Book book;
16 | 
17 |     /**
18 |      * 创意一个初始化的{@code BookDLException}，并指定发生错误的书籍。
19 |      *
20 |      * @param book 发生下载错误的书籍
21 |      */
22 |     public BookDLException(Book book) {
23 |         this.book = book;
24 |     }
25 | 
26 |     /**
27 |      * 获取发生下载错误的书籍
28 |      *
29 |      * @return 发生下载错误的书籍
30 |      */
31 |     public Book getBook() {
32 |         return book;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/exception/BookPagesDLException.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object.exception;
 2 | 
 3 | import java.util.Vector;
 4 | 
 5 | /**
 6 |  * 下载某一本书时发生错误。
 7 |  * <p>
 8 |  * 此异常发生在书本对应文件夹已经创建之后。
 9 |  * 包含了此书所有的书页下载错误{@code PageDLException}，用于错误恢复
10 |  *
11 |  * @author padeoe
12 |  *         Date: 2016/12/10
13 |  */
14 | public class BookPagesDLException extends Exception {
15 |     Vector<PageDLException> pageDLExceptions;
16 | 
17 |     /**
18 |      * 构造一个{@code BookPagesDLException},用此书所有的书页下载错误初始化
19 |      *
20 |      * @param pageDLExceptionList 此书所有的书页下载错误
21 |      */
22 |     public BookPagesDLException(Vector<PageDLException> pageDLExceptionList) {
23 |         this.pageDLExceptions = pageDLExceptionList;
24 |     }
25 | 
26 |     /**
27 |      * 获取页错误的集合
28 |      *
29 |      * @return 此书所有的书页下载错误{@code PageDLException}
30 |      */
31 |     public Vector<PageDLException> getPageDLExceptions() {
32 |         return pageDLExceptions;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/object/exception/PageDLException.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.object.exception;
 2 | 
 3 | /**
 4 |  * 下载图书的某一页时失败。
 5 |  * <p>
 6 |  * 该类包含了错误现场的信息，可用于错误恢复与后期处理
 7 |  *
 8 |  * @author padeoe
 9 |  *         Date: 2016/12/10
10 |  */
11 | public class PageDLException extends Exception {
12 |     private String url;
13 |     private String location;
14 | 
15 |     /**
16 |      * 创建并初始化一个{@code PageDLException}对象。指定下载地址和存储地址。
17 |      *
18 |      * @param url      出错页图片的网络地址
19 |      * @param location 出错页图片本应存储的本地路径。不含图片后缀名
20 |      */
21 |     public PageDLException(String url, String location) {
22 |         super();
23 |         this.url = url;
24 |         this.location = location;
25 |     }
26 | 
27 |     /**
28 |      * 获取出错页的URL
29 |      *
30 |      * @return 出错页的URL
31 |      */
32 |     public String getUrl() {
33 |         return url;
34 |     }
35 | 
36 |     /**
37 |      * 获取出错页图片本应存储的本地路径。
38 |      *
39 |      * @return 出错页图片本应存储的本地路径。不含图片后缀名
40 |      */
41 |     public String getLocation() {
42 |         return location;
43 |     }
44 | 
45 |     @Override
46 |     public String toString() {
47 |         return "PageDLException{" +
48 |                 "url='" + url + '\'' +
49 |                 ", location='" + location + '\'' +
50 |                 '}';
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/spider/BookDownloader.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.spider;
  2 | 
  3 | import com.njulib.fix.MissingPageCompletion;
  4 | import com.njulib.object.Book;
  5 | import com.njulib.object.InfoReader;
  6 | import com.njulib.object.exception.BookDLException;
  7 | import com.njulib.object.exception.BookPagesDLException;
  8 | import com.njulib.object.exception.PageDLException;
  9 | import org.jsoup.Jsoup;
 10 | import org.jsoup.nodes.Document;
 11 | import org.jsoup.nodes.Element;
 12 | import utils.network.MyHttpRequest;
 13 | import utils.network.ReturnData;
 14 | 
 15 | import java.io.*;
 16 | import java.nio.charset.StandardCharsets;
 17 | import java.nio.file.Path;
 18 | import java.nio.file.Paths;
 19 | import java.util.*;
 20 | import java.util.concurrent.atomic.AtomicInteger;
 21 | import java.util.regex.Matcher;
 22 | import java.util.regex.Pattern;
 23 | 
 24 | /**
 25 |  * 书本的下载器，分离了下载相关的函数及变量。
 26 |  *
 27 |  * @author padeoe
 28 |  *         Date: 2016/12/09
 29 |  */
 30 | public class BookDownloader {
 31 |     private String errorLogPath = ERROR_LOG_NAME;
 32 |     private int threadNumber = 5;
 33 | 
 34 |     /**
 35 |      * 获取下载器对应的{@code Book}
 36 |      *
 37 |      * @return 下载器对应的{@code Book}
 38 |      */
 39 |     public Book getBook() {
 40 |         return book;
 41 |     }
 42 | 
 43 |     private Book book;
 44 |     private Map<PageType, Integer> pageNumberMap;
 45 |     private String savePath = System.getProperty("user.dir");
 46 |     private Path directory;
 47 |     private String urlPrefix;
 48 |     private PageType[] pageTypes = {PageType.COVER, PageType.BOOKNAME, PageType.LEGALINFO, PageType.INTRODUCTION,
 49 |             PageType.DIRECTORY, PageType.CONTENT, PageType.APPENDIX, PageType.BACKCOVER};
 50 |     private AtomicInteger needDownload = new AtomicInteger(1);
 51 | 
 52 |     /**
 53 |      * 获取{@code Book}的页组成结构。
 54 |      *
 55 |      * @return 记录了每种{@link PageType}的数量。
 56 |      * @throws BookDLException 页组成获取失败，书本下载放弃
 57 |      */
 58 |     public Map<PageType, Integer> getPageNumberMap() throws BookDLException {
 59 |         if (pageNumberMap == null) {
 60 |             initialBookPara();
 61 |             return pageNumberMap;
 62 |         }
 63 |         return pageNumberMap;
 64 |     }
 65 | 
 66 |     /**
 67 |      * 获取{@code Book}图片的URL前缀
 68 |      *
 69 |      * @return {@code Book}图片的URL前缀
 70 |      * @throws BookDLException 前缀获取失败，书本下载被放弃。
 71 |      */
 72 |     public String getUrlPrefix() throws BookDLException {
 73 |         if (urlPrefix == null) {
 74 |             initialBookPara();
 75 |             return urlPrefix;
 76 |         }
 77 |         return urlPrefix;
 78 |     }
 79 | 
 80 |     /**
 81 |      * 错误日志的默认文件名
 82 |      */
 83 |     public static final String ERROR_LOG_NAME = "error.log";
 84 |     /**
 85 |      * 书本信息记录的默认文件名
 86 |      */
 87 |     public static final String INFO_FILE_NAME = "info.txt";
 88 | 
 89 |     /**
 90 |      * 创建指定{@code book}的下载器
 91 |      *
 92 |      * @param book 指定的书本
 93 |      */
 94 |     public BookDownloader(Book book) {
 95 |         this.book = book;
 96 |     }
 97 | 
 98 |     /**
 99 |      * 创建指定{@code Book}的下载器，将根据{@code bookid}创建{@link Book}对象
100 |      *
101 |      * @param bookid 书本id
102 |      */
103 |     BookDownloader(String bookid) {
104 |         this.book = new Book(bookid);
105 |     }
106 | 
107 |     /**
108 |      * 查看下载线程数
109 |      *
110 |      * @return 当前指定的下载线程数。默认为5
111 |      */
112 |     public int getThreadNumber() {
113 |         return threadNumber;
114 |     }
115 | 
116 |     /**
117 |      * 设置下载线程数。书本与书本之间将会依次单线程下载。书本的所有页将会采用多线程下载。
118 |      *
119 |      * @param threadNumber 线程数
120 |      */
121 |     public void setThreadNumber(int threadNumber) {
122 |         this.threadNumber = threadNumber;
123 |     }
124 | 
125 |     /**
126 |      * 设置文件夹名
127 |      *
128 |      * @param directoryString 文件夹名
129 |      */
130 |     public void setDirectory(String directoryString) {
131 |         String directoryName = directoryString.replaceAll("[/\\\\:\"*?<>|]", " ");
132 |         directory = Paths.get(savePath, directoryName);
133 |     }
134 | 
135 |     /**
136 |      * 设置保存路径
137 |      *
138 |      * @param savePath 下载保存路径
139 |      */
140 |     public void setSavePath(String savePath) {
141 |         this.savePath = savePath;
142 |     }
143 | 
144 |     /**
145 |      * 下载图片
146 |      *
147 |      * @param url      图片的url
148 |      * @param pathname 保存的路径，包括文件名(不含图片后缀),例如"C:/Users/username/a"，函数执行后会保存为"C:/Users/username/a.png"
149 |      * @throws IOException 下载出错
150 |      */
151 |     public static void downloadImage(String url, String pathname) throws IOException {
152 |         ReturnData returnData = MyHttpRequest.action_returnbyte("GET", null, url, null, null, null, 2000);
153 |         byte[] a = returnData.getData();
154 |         List<String> types = returnData.getHeaders().get("Content-Type");
155 |         String suffix = ".png";
156 |         if (types != null && types.get(0) != null) {
157 |             suffix = types.get(0).substring(types.get(0).indexOf('/') + 1, types.get(0).length()).toLowerCase();
158 |             suffix = suffix.equals("jpeg") ? ".jpg" : (suffix.equals("png") ? ".png" : suffix);
159 |         }
160 |         File file = new File(pathname + suffix);
161 |         BufferedOutputStream bf = new BufferedOutputStream(new FileOutputStream(file));
162 |         bf.write(a, 0, a.length);
163 |         bf.close();
164 |     }
165 | 
166 | 
167 |     /**
168 |      * 初始化下载参数，从服务器查询书本下载所需的参数,包括书页url，书本页数，页类型
169 |      * 执行后
170 |      *
171 |      * @throws BookDLException 查询参数出错，书本下载被终止
172 |      */
173 |     private void initialBookPara() throws BookDLException {
174 |         //获取页面地址
175 |         String url;
176 |         try {
177 |             url = book.getbookread();
178 |             getBookPara(url);
179 |         } catch (IOException e) {
180 |             e.printStackTrace();
181 |             throw new BookDLException(book);
182 |         }
183 |     }
184 | 
185 |     public String getBookViewPageHtml(String url) throws BookDLException {
186 |         if (url == null || url.length() == 0) {
187 |             throw new BookDLException(book);
188 |         }
189 |         //获取书本参数，包括下载地址前缀，页数
190 |         String html;
191 |         try {
192 |             html = MyHttpRequest.get(url, null, "UTF-8", 2000);
193 |         } catch (IOException e) {
194 |             e.printStackTrace();
195 |             throw new BookDLException(book);
196 |         }
197 |         return html;
198 |     }
199 | 
200 |     private void getBookPara(String url) throws BookDLException {
201 |         String html = getBookViewPageHtml(url);
202 |         Document doc = Jsoup.parse(html);
203 |         Element infoNode = doc.getElementsByTag("script").last();
204 |         pageNumberMap = new HashMap<>();
205 |         int epage = 0;
206 |         if (infoNode.dataNodes().size() > 0) {
207 |             String paraJs = infoNode.dataNodes().get(0).getWholeData();
208 |             Pattern pattern = Pattern.compile("var str='(.*)';.*epage = (\\d+);.*pages :\\[\\[1,(\\d+)\\],\\[1,(\\d+)\\],\\[1,(\\d+)\\]," +
209 |                     "\\[1,(\\d+)\\], \\[1,(\\d+)\\], \\[spage, epage\\], \\[1,(\\d+)\\], \\[1,(\\d+)\\]\\],.*", Pattern.DOTALL);
210 |             Matcher matcher = pattern.matcher(paraJs);
211 |             if (matcher.find()) {
212 |                 urlPrefix = matcher.group(1);
213 |                 pageNumberMap.put(pageTypes[5], Integer.parseInt(matcher.group(2)));
214 |                 pageNumberMap.put(pageTypes[0], Integer.parseInt(matcher.group(3)));
215 |                 pageNumberMap.put(pageTypes[1], Integer.parseInt(matcher.group(4)));
216 |                 pageNumberMap.put(pageTypes[2], Integer.parseInt(matcher.group(5)));
217 |                 pageNumberMap.put(pageTypes[3], Integer.parseInt(matcher.group(6)));
218 |                 pageNumberMap.put(pageTypes[4], Integer.parseInt(matcher.group(7)));
219 |                 pageNumberMap.put(pageTypes[6], Integer.parseInt(matcher.group(8)));
220 |                 pageNumberMap.put(pageTypes[7], Integer.parseInt(matcher.group(9)));
221 |             } else {
222 |                 throw new BookDLException(book);
223 |             }
224 |         } else {
225 |             System.out.println(book.getId() + " 参数获取失败");
226 |             throw new BookDLException(book);
227 |         }
228 |     }
229 | 
230 |     /**
231 |      * 通过书页页数判断是否是同一本书，如果是则补全info文件，如果不是则不执行操作
232 |      *
233 |      * @throws BookDLException 从服务器查询书本参数时出错
234 |      */
235 |     private void checkOldDirByPageSize() throws BookDLException {
236 |         File[] oldfiles = directory.toFile().listFiles();
237 |         int oldBookSize = oldfiles == null ? 0 : oldfiles.length;
238 |         //查询当前书本的页数
239 |         initialBookPara();
240 |         int newBookSize = pageNumberMap.values().stream().mapToInt(number -> number).sum();
241 |         //若书页数相同，假定为同一本书，帮他补全info文件
242 |         if (oldBookSize == newBookSize) {
243 |             logBookInfo();
244 |             System.out.println("已存在，跳过并补全了info文件" + book.toString());
245 |         }
246 |     }
247 | 
248 |     /**
249 |      * 开始创建文件夹并下载，该函数调用前保存路径以及文件夹名必须已经设置完毕。该环节有多个出口：
250 |      * 如果文件夹存在，将会调用{@link #handleOldDir()}进行下一步处理
251 |      * 如果文件夹不存在，将会初始化参数并调用{@link #downloadFromParaSetDone()} 进行下一步处理
252 |      *
253 |      * @throws BookPagesDLException 书本下载过程中发生了缺页
254 |      * @throws BookDLException      书本下载未开始
255 |      */
256 |     private void downloadFromMkdir() throws BookPagesDLException, BookDLException {
257 |         File path = directory.toFile();
258 |         //若目录存在，进入目录存在的处理例程
259 |         if (path.exists()) {
260 |             handleOldDir();
261 |             return;
262 |         }
263 |         //目录不存，准备下载。首先获取下载参数
264 |         System.out.println("开始下载 " + book);
265 |         //获取书本参数
266 |         initialBookPara();
267 |         if (!path.mkdirs()) {
268 |             System.out.println("文件夹创建失败");
269 |             throw new BookDLException(book);
270 |         }
271 |         downloadFromParaSetDone();
272 |     }
273 | 
274 |     /**
275 |      * 书本参数已经从服务器获取完毕，直接进行下载并保存。
276 |      *
277 |      * @throws BookPagesDLException 书本的某些页下载失败
278 |      */
279 |     private void downloadFromParaSetDone() throws BookPagesDLException {
280 |         Vector<PageDLException> pageDLExceptions = new Vector<>();
281 |         //首先顺序下载非正文内容
282 |         for (int i = 0; i < pageTypes.length; i++) {
283 |             if (i != 5) {
284 |                 try {
285 |                     download(pageTypes[i]);
286 |                 } catch (BookPagesDLException e) {
287 |                     pageDLExceptions.addAll(e.getPageDLExceptions());
288 |                 }
289 |             }
290 |         }
291 |         try {
292 |             downloadContent();
293 |             //日志记录书本信息
294 |             logBookInfo();
295 |         } catch (BookPagesDLException e) {
296 |             pageDLExceptions.addAll(e.getPageDLExceptions());
297 |         }
298 |         if (!pageDLExceptions.isEmpty()) {
299 |             throw new BookPagesDLException(pageDLExceptions);
300 |         }
301 |     }
302 | 
303 |     /**
304 |      * 下载文件夹已存在的处理函数。该函数会读取旧的文件夹下的info文件来判断待下载是不是同一本书。
305 |      * 该步骤有多个出口：
306 |      * 如果info文件不存在或效，将调用{@link #checkOldDirByPageSize()}做进一步判断
307 |      * 如果info文件存在且有效，读取info中书本id比对是否是同一本书：
308 |      * 如果是同一本书，将跳过；如果不是同一本书，将重新设置保存路径和文件夹名，并调用{@link #downloadFromMkdir()}进行下一步处理
309 |      *
310 |      * @throws BookPagesDLException 书本下载过程中发生了缺页
311 |      * @throws BookDLException      书本下载未开始
312 |      */
313 |     private void handleOldDir() throws BookPagesDLException, BookDLException {
314 |         //开始检查是否真的是重复还是同名而已，根据书的id判断
315 |         //读取info文件
316 |         Path infoFilePath = directory.resolve(INFO_FILE_NAME);
317 |         File infoFile = infoFilePath.toFile();
318 |         if (infoFile.exists()) {
319 |             //info文件存在，读取info文件记录的书本id
320 |             Book oldbook;
321 |             oldbook = new InfoReader(infoFilePath.toString()).read();
322 |             //读出了旧的书本信息
323 |             if (oldbook != null) {
324 |                 String oldBookId = oldbook.getId();
325 |                 //两本书是同一本书
326 |                 if (oldBookId.equals(book.getId())) {
327 |                     System.out.println("已存在，跳过" + book.toString());
328 |                     return;
329 |                 }
330 |                 //两本书是不同的书
331 |                 else {
332 |                     //如果两本书作者不同，文件夹添加作者名进行命名,并开始下载
333 |                     if (!book.getAuthor().equals(oldbook.getAuthor())) {
334 |                         setDirectory(book.getName() + "-" + book.getAuthor());
335 |                         downloadFromMkdir();
336 |                     }
337 |                     //如果两本书作者相同，用作者名加id命名
338 |                     else {
339 |                         setDirectory(book.getName() + "-" + book.getAuthor() + "-" + book.getId());
340 |                         downloadFromMkdir();
341 |                     }
342 |                 }
343 |             }
344 |             //info文件格式不正确，没有读出信息
345 |             //假定就文件夹是一本旧的书目,文件夹添加作者名进行命名,并开始下载
346 |             else {
347 |                 setDirectory(book.getName() + "-" + book.getAuthor());
348 |                 downloadFromMkdir();
349 |             }
350 |         } else {
351 |             //info文件不存在，比对书本页数数量是否是同一本书决定下一步操作
352 |          //   checkOldDirByPageSize();
353 |             System.out.println("将删除没有info文件的目录"+directory.getFileName());
354 |             if(deleteDir(directory.toFile())){
355 |                 downloadFromMkdir();
356 |             }
357 |             else{
358 |                 throw new BookDLException(this.book);
359 |             }
360 | 
361 |         }
362 |     }
363 | 
364 |     /**
365 |      * 递归删除目录下的所有文件及子目录下所有文件
366 |      * @param dir 将要删除的文件目录
367 |      * @return boolean Returns "true" if all deletions were successful.
368 |      *                 If a deletion fails, the method stops attempting to
369 |      *                 delete and returns "false".
370 |      */
371 |     public static boolean deleteDir(File dir) {
372 |         if (dir.isDirectory()) {
373 |             String[] children = dir.list();
374 |             //递归删除目录中的子目录下
375 |             for (int i=0; i<children.length; i++) {
376 |                 boolean success = deleteDir(new File(dir, children[i]));
377 |                 if (!success) {
378 |                     return false;
379 |                 }
380 |             }
381 |         }
382 |         // 目录此时为空，可以删除
383 |         return dir.delete();
384 |     }
385 | 
386 | 
387 |     /**
388 |      * 将书本下载保存为图片格式，书的每一页将会保存为一张图片
389 |      */
390 |     public void downloadAllImages() {
391 |         setDirectory(book.getName() != null ? book.getName() : book.getId());
392 |         try {
393 |             downloadFromMkdir();
394 |         } catch (BookDLException e) {
395 |             logBookFail(errorLogPath);//错误日志，记录未下载书籍
396 |         } catch (BookPagesDLException e) {
397 |             logPageFail(e, errorLogPath);//错误日志，记录单页下载失败
398 |         }
399 |     }
400 | 
401 | 
402 |     /**
403 |      * 在同文件夹下创建记录{@code Book}信息的文件，
404 |      * 文件名是""info.txt""
405 |      */
406 |     private void logBookInfo() {
407 |         try {
408 |             FileWriter fileWriter = new FileWriter(new File(directory.resolve("info.txt").toString()));
409 |             fileWriter.write(book.toString());
410 |             fileWriter.close();
411 |         } catch (IOException e) {
412 |             e.printStackTrace();
413 |         }
414 |     }
415 | 
416 |     /**
417 |      * 下载{@code PageType.CONTENT}部分所有的页，即正文部分。
418 |      * 不直接调用{@link #download(PageType)}下载正文部分是因为其采用了单线程下载。正文部分书页较多，因此本方法会使用多线程下载。
419 |      *
420 |      * @throws BookPagesDLException 书本的某些页下载失败
421 |      */
422 |     private void downloadContent() throws BookPagesDLException {
423 |         int firstPage = getFirstPage(PageType.CONTENT);//第一页的序号
424 |         final int pageSize = pageNumberMap.get(PageType.CONTENT);//正文总页数
425 |         //System.out.println("正文页码" + firstPage + "~" + lastPage);
426 |         needDownload.set(1);
427 |         Vector<PageDLException> pageDLExceptions = new Vector<>();
428 |         ArrayList<Thread> threadArrayList = new ArrayList<>();
429 |         for (int i = 0; i < threadNumber; i++) {
430 |             threadArrayList.add(new Thread() {
431 |                 @Override
432 |                 public void run() {
433 |                     super.run();
434 |                     while (true) {
435 |                         int downloading = needDownload.getAndIncrement();
436 |                         if (downloading <= pageSize) {
437 |                             //System.out.println("假装在下载 "+downloading);
438 |                             try {
439 |                                 download(PageType.CONTENT, downloading, String.format("%04d", firstPage + downloading - 1));
440 |                             } catch (PageDLException e) {
441 |                                 pageDLExceptions.add(e);
442 |                             }
443 |                         } else {
444 |                             break;
445 |                         }
446 |                     }
447 |                 }
448 |             });
449 |         }
450 |         for (Thread thread : threadArrayList) {
451 |             thread.start();
452 |         }
453 |         for (Thread thread : threadArrayList) {
454 |             try {
455 |                 thread.join();
456 |             } catch (InterruptedException e) {
457 |                 e.printStackTrace();
458 |             }
459 |         }
460 |         if (!pageDLExceptions.isEmpty()) {
461 |             throw new BookPagesDLException(pageDLExceptions);
462 |         }
463 |     }
464 | 
465 |     /**
466 |      * 下载某一种页类型的所有页。
467 |      *
468 |      * @param pageType 页类型
469 |      * @throws BookPagesDLException 某些页下载失败
470 |      */
471 |     private void download(PageType pageType) throws BookPagesDLException {
472 |         Vector<PageDLException> pageDLExceptions = new Vector<>();
473 |         int base = getFirstPage(pageType);
474 |         for (int i = 0; i < pageNumberMap.get(pageType); i++) {
475 |             try {
476 |                 download(pageType, i + 1, String.format("%04d", base + i));
477 |             } catch (PageDLException e) {
478 |                 pageDLExceptions.add(e);
479 |             }
480 |         }
481 |         if (!pageDLExceptions.isEmpty()) {
482 |             throw new BookPagesDLException(pageDLExceptions);
483 |         }
484 |     }
485 | 
486 |     /**
487 |      * 获取某一种类型页的第一页页码
488 |      *
489 |      * @param pageType 书页类型
490 |      * @return 相对于整本书的页码
491 |      */
492 |     private int getFirstPage(PageType pageType) {
493 |         int base = 1;//该种类型页的第一页的页码
494 |         for (PageType pageType1 : pageTypes) {
495 |             if (pageType1.equals(pageType)) {
496 |                 break;
497 |             } else {
498 |                 base += pageNumberMap.get(pageType1);
499 |             }
500 |         }
501 |         return base;
502 |     }
503 | 
504 |     /**
505 |      * 下载某一种页类型的特定页
506 |      *
507 |      * @param pageType 页类型
508 |      * @param page     图书的页码
509 |      * @throws PageDLException 某些页下载失败
510 |      */
511 |     private void download(PageType pageType, int page, String filename) throws PageDLException {
512 |         int pageNumberLength = 6 - pageType.name.length();
513 |         StringBuilder url = new StringBuilder();
514 |         url.append(urlPrefix).append(pageType.name);
515 |         for (int i = 0; i < pageNumberLength - String.valueOf(page).length(); i++) {
516 |             url.append('0');
517 |         }
518 |         url.append(page);
519 |         url.append(".jpg");
520 |         String finalurl = url.toString();
521 |         String pathname = directory.resolve(filename).toString();
522 |         try {
523 |             downloadImage(finalurl, pathname);
524 |         } catch (IOException e) {
525 |             try {
526 |                 downloadImage(finalurl, pathname);
527 |             } catch (IOException e1) {
528 |                 throw new PageDLException(finalurl, pathname);
529 |             }
530 | 
531 |         }
532 |     }
533 | 
534 |     /**
535 |      * 输出单页下载失败的日志，可以使用{@link MissingPageCompletion}来读取错误日志并恢复
536 |      *
537 |      * @param bookPagesDLException 单页失败异常
538 |      * @param pageFailLogPath      日志路径
539 |      */
540 |     private static void logPageFail(BookPagesDLException bookPagesDLException, String pageFailLogPath) {
541 |         Vector<PageDLException> pageDLExceptions = bookPagesDLException.getPageDLExceptions();
542 |         for (PageDLException pageDLException : pageDLExceptions) {
543 |             writeFile(pageFailLogPath, pageDLException.toString());
544 |         }
545 | 
546 |     }
547 | 
548 |     /**
549 |      * 输出整本书下载失败的日志，用于后期恢复(暂未完成)
550 |      *
551 |      * @param bookFailLogPath 日志路径
552 |      */
553 |     private void logBookFail(String bookFailLogPath) {
554 |         writeFile(bookFailLogPath, book.toString());
555 |     }
556 | 
557 |     public static void writeFile(String filepath, String content) {
558 |         Writer fstream = null;
559 |         try {
560 |             fstream = new OutputStreamWriter(new FileOutputStream(filepath, true), StandardCharsets.UTF_8);
561 |             fstream.write(content);
562 |             fstream.write(System.getProperty("line.separator"));
563 |             fstream.close();
564 |         }
565 |         catch (IOException e) {
566 |             e.printStackTrace();
567 |         }
568 |     }
569 | 
    /**
     * Sets the path of the error log that {@link #downloadAllImages()} writes
     * book and page download failures to.
     *
     * @param errorLogPath path of the error log file
     */
    public void setErrorLogPath(String errorLogPath) {
        this.errorLogPath = errorLogPath;
    }
573 | 
574 |     /**
575 |      * 书页的类型。每本书都由"封面"，"正文"，"目录"等若干种固定的页类型组成。
576 |      */
577 |     public enum PageType {
578 |         COVER("cov", 1), BOOKNAME("bok", 2), LEGALINFO("leg", 3), INTRODUCTION("fow", 4), DIRECTORY("!", 5),
579 |         CONTENT("", 6), APPENDIX("att", 7), BACKCOVER("cov", 8);
580 |         private String name;
581 |         private int index;
582 | 
583 |         PageType(String name, int index) {
584 |             this.name = name;
585 |             this.index = index;
586 |         }
587 | 
588 |         public String getName() {
589 |             return name;
590 |         }
591 | 
592 |         public int getIndex() {
593 |             return index;
594 |         }
595 |     }
596 | }
597 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/spider/BookSearch.java:
--------------------------------------------------------------------------------
  1 | package com.njulib.spider;
  2 | 
  3 | import com.njulib.object.Book;
  4 | import com.njulib.object.Books;
  5 | import com.njulib.object.RootBookClass;
  6 | import org.jsoup.Jsoup;
  7 | import org.jsoup.nodes.Document;
  8 | import org.jsoup.select.Elements;
  9 | import utils.network.MyHttpRequest;
 10 | 
 11 | import java.io.IOException;
 12 | import java.net.URLEncoder;
 13 | import java.util.HashMap;
 14 | import java.util.Map;
 15 | import java.util.Set;
 16 | 
 17 | /**
 18 |  * 查询符合条件的书籍。
 19 |  * <p>
 20 |  * 从<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>查询符合条件的书籍。
 21 |  * 可通过书名或者sql语句查询书籍。
 22 |  * 可以在查询过程中动态创建图书的分类目录结构。
 23 |  *
 24 |  * @author padeoe
 25 |  * @Date: 2016/12/09
 26 |  */
 27 | public class BookSearch {
 28 |     String cookie;
 29 | 
 30 |     /**
 31 |      * 查询
 32 |      *
 33 |      * @throws IOException 查询失败
 34 |      */
 35 |     public BookSearch() throws IOException {
 36 |         this.cookie = NJULib.getSession();
 37 |     }
 38 | 
 39 |     /**
 40 |      * 通过指定sql查询的where子句进行图书查询
 41 |      *
 42 |      * @param sqlWhereClause 一些已知字段包括"书名","主题词","出版日期","作者"
 43 |      * @param page           查询结果列表的页码
 44 |      * @param rootBookClass  查询到的书本将会添加进该分类结构
 45 |      * @return 查询结果，包含查询到的书本列表，书本总数量和结果总页数
 46 |      * @throws IOException 查询失败
 47 |      */
 48 |     public Books searchBySQL(String sqlWhereClause, int page, RootBookClass rootBookClass) throws IOException {
 49 |         String url = NJULib.baseUrl + "/markbook/BookSearch.jsp";
 50 |         String data = "Page=" + page + "&MethodType=1" + "&Library=&KeyName=0&Condition=" + URLEncoder.encode(sqlWhereClause, "UTF-8") + "&Sort=&links=0&PSize=10&_=";
 51 |         Map<String, String> requestProperty = new HashMap<>();
 52 |         requestProperty.put("Content-type", "application/x-www-form-urlencoded; charset=UTF-8");
 53 |         String result = MyHttpRequest.postWithCookie(data, url, requestProperty, cookie, "UTF-8", "GBK", 2000);
 54 |         int totalNums = 0, totalPage = 0;
 55 |         Document doc = Jsoup.parse(result);
 56 |         Elements totalNumsNode = doc.select("input[name=TotalNums]");
 57 |         if (totalNumsNode != null && totalNumsNode.size() > 0) {
 58 |             totalNums = Integer.parseInt(totalNumsNode.get(0).attr("value"));
 59 |         }
 60 |         Elements totalPageNode = doc.select("a[href]:contains(末页)");
 61 |         if (totalPageNode != null && totalPageNode.size() > 0) {
 62 |             String href = totalPageNode.get(0).attr("href");
 63 |             int start = href.indexOf('(') + 1;
 64 |             int end = href.indexOf(')');
 65 |             if (start != 0 && end != -1) {
 66 |                 totalPage = Integer.parseInt(href.substring(start, end));
 67 |             }
 68 |         }
 69 |         Set<Book> books = rootBookClass.queryBooks(result);
 70 |         return new Books(page, totalPage, totalNums, books);
 71 |     }
 72 | 
 73 |     /**
 74 |      * 通过指定sql查询的where子句进行图书查询
 75 |      *
 76 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
 77 |      * @param page           查询结果列表的页码
 78 |      * @return 如果没有匹配结果，返回空的对象
 79 |      * @throws IOException 查询失败
 80 |      */
 81 |     public Books searchBySQL(String sqlWhereClause, int page) throws IOException {
 82 |         return searchBySQL(sqlWhereClause, page, new RootBookClass());
 83 |     }
 84 | 
 85 |     /**
 86 |      * 通过指定sql查询的where子句进行图书查询，只返回第一页结果。
 87 |      *
 88 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
 89 |      * @return 如果没有匹配结果，返回空的对象
 90 |      * @throws IOException 查询失败
 91 |      */
 92 |     public Books searchBySQL(String sqlWhereClause) throws IOException {
 93 |         return searchBySQL(sqlWhereClause, 1);
 94 |     }
 95 | 
 96 |     /**
 97 |      * 通过指定sql查询的where子句进行图书查询
 98 |      *
 99 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
100 |      * @return 查询结果，书的集合
101 |      * @throws IOException 查询失败
102 |      */
103 |     public Set<Book> findAllBySQL(String sqlWhereClause) throws IOException {
104 |         Set<Book> bookSet = null;
105 |         Books firstPageBooks = searchBySQL(sqlWhereClause, 1);
106 |         bookSet = firstPageBooks.getBookSet();
107 |         for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) {
108 |             bookSet.addAll(searchBySQL(sqlWhereClause, i).getBookSet());
109 |         }
110 |         return bookSet;
111 |     }
112 | 
113 |     /**
114 |      * 通过指定sql查询的where子句进行图书查询,并把查询结果中的图书添加进分类结构
115 |      *
116 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
117 |      * @param rootBookClass  根分类
118 |      * @return 查询结果，书本集合
119 |      * @throws IOException 查询失败
120 |      */
121 |     public Set<Book> findAllBySQL(String sqlWhereClause, RootBookClass rootBookClass) throws IOException {
122 | 
123 |         Books firstPageBooks = searchBySQL(sqlWhereClause, 1, rootBookClass);
124 |         Set<Book> bookSet = firstPageBooks.getBookSet();
125 |         for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) {
126 |             bookSet.addAll(searchBySQL(sqlWhereClause, i, rootBookClass).getBookSet());
127 |         }
128 |         return bookSet;
129 |     }
130 | 
131 |     private Books searchByName(String name) throws IOException {
132 |         return searchBySQL("书名 like '%" + name + "%' ");
133 |     }
134 | }
135 | 


--------------------------------------------------------------------------------
/src/main/java/com/njulib/spider/NJULib.java:
--------------------------------------------------------------------------------
 1 | package com.njulib.spider;
 2 | 
 3 | import utils.network.MyHttpRequest;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | /**
 8 |  * 用于获取Session
 9 |  *
10 |  * @author padeoe
11 |  *         Date: 2016/12/08
12 |  */
13 | public class NJULib {
14 |     public static final String baseUrl = "http://114.212.7.104:8181";
15 | 
16 |     /**
17 |      * 获取SeesionId
18 |      *
19 |      * @return SeesionId
20 |      * @throws IOException 出现网络错误
21 |      */
22 |     public static String getSession() throws IOException {
23 |         System.out.println("正在重置cookie");
24 |         String Url = baseUrl + "/markbook/";
25 |         return MyHttpRequest.getAndGetCookie(Url, null, "UTF-8", 1000)[1];
26 |     }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/Start.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary;
 2 | 
 3 | import com.sslibrary.object.Book;
 4 | import com.sslibrary.object.BookClass;
 5 | import com.sslibrary.object.Books;
 6 | import com.sslibrary.object.RootBookClass;
 7 | import com.sslibrary.spider.BookSearch;
 8 | 
 9 | import java.io.IOException;
10 | import java.util.List;
11 | import java.util.Set;
12 | 
13 | /**
14 |  * @author padeoe
15 |  * @Date: 2016/12/10
16 |  */
17 | public class Start {
18 |     /**
19 |      * 一个使用示例。请修改下面代码的两个文件存储路径，再运行。
20 |      * 当前示例会下载计算机分类下所有书。
21 |      * 下载过程中可以终止程序从而终止下载。下一次下载时会跳过下载分类中已有的书本。
22 |      *
23 |      * @param args
24 |      */
25 |     public static void main(String[] args) {
26 |         //创建一个书目分类，此处定义的是0T0P3010 计算机类，具体解释请参考中图法
27 |         // 格式必须和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>一致
28 |         BookClass root=new BookClass("0N","自然科学总论",new RootBookClass());
29 |         try {
30 |             root.downloadWithCataDir("F:\\Book\\all",5,"F:\\error.log");
31 |             //      root.downloadWithCataDir("/opt/seafile/wkk_test/all",5,"/opt/seafile/wkk_test/error.log");
32 |         } catch (IOException e) {
33 |             e.printStackTrace();
34 |         }
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/fix/FileRenamer.java:
--------------------------------------------------------------------------------
  1 | package com.sslibrary.fix;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.IOException;
  6 | import java.io.InputStream;
  7 | import java.nio.file.Files;
  8 | import java.nio.file.Path;
  9 | import java.nio.file.Paths;
 10 | import java.util.Arrays;
 11 | 
 12 | /**
 13 |  * 重命名之前版本程序下载的文件。
 14 |  * 之前版本造成了下载的文件命名不合理。
 15 |  *
 16 |  * @author padeoe
 17 |  * @Date: 2016/12/13
 18 |  */
 19 | public class FileRenamer {
 20 |     public static void main(String args[]) {
 21 |         renameZero("G:\\Test\\");
 22 |     }
 23 | 
 24 |     public static void renameZero(String rootDirPath) {
 25 |         Path root = Paths.get(rootDirPath);
 26 |         File rootDir = root.toFile();
 27 |         if (rootDir.isDirectory()) {
 28 |             File dirs[] = rootDir.listFiles();
 29 |             Arrays.asList(dirs).parallelStream().forEach(FileRenamer::handleEndDir);
 30 |         } else {
 31 |             System.out.println("根目录不是目录，终止");
 32 |         }
 33 |     }
 34 | 
 35 |     public static void handleEndDir(File dir) {
 36 |         if (dir.isDirectory()) {
 37 |             System.out.println("正在处理" + dir.getName());
 38 |             File files[] = dir.listFiles();
 39 |             Arrays.asList(files).parallelStream().forEach(file -> rename(dir, file));
 40 |         } else {
 41 |             System.out.println(dir.getName() + "不是目录，跳过");
 42 |         }
 43 |     }
 44 | 
 45 |     private static void rename(File dir, File file) {
 46 |         String name = file.getName();
 47 |         if (name.endsWith("png") || name.endsWith("jpg")) {
 48 |             String prefix = name.substring(0, name.indexOf('.'));
 49 |             name = name.replaceAll(prefix, String.format("%04d", Integer.parseInt(prefix)));
 50 |             try {
 51 |                 Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath());
 52 |             } catch (IOException e) {
 53 |                 System.out.println(file.toString());
 54 |             }
 55 |         }
 56 |     }
 57 | 
 58 |     public static void renameSuffix(String rootDirPath) {
 59 |         Path root = Paths.get(rootDirPath);
 60 |         File rootDir = root.toFile();
 61 |         if (rootDir.isDirectory()) {
 62 |             File dirs[] = rootDir.listFiles();
 63 |             Arrays.asList(dirs).parallelStream().forEach(FileRenamer::imageEndDir);
 64 |         } else {
 65 |             System.out.println("根目录不是目录，终止");
 66 |         }
 67 |     }
 68 | 
 69 |     public static void imageEndDir(File dir) {
 70 |         if (dir.isDirectory()) {
 71 |             System.out.println("正在处理" + dir.getName());
 72 |             File files[] = dir.listFiles();
 73 |             for (File file : files) {
 74 |                 String name = file.getName();
 75 |                 String prefix = name.substring(0, name.indexOf('.'));
 76 |                 String trueSuffix = getImageSuffix(file);
 77 |                 if ((name.endsWith("png") || name.endsWith("jpg")) && trueSuffix != null && !name.endsWith(trueSuffix)) {
 78 |                     name = prefix + "." + trueSuffix;
 79 |                     //  System.out.println("需要修改为"+name);
 80 |                     try {
 81 |                         Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath());
 82 |                     } catch (IOException e) {
 83 |                         System.out.println("修改出错" + file.toString());
 84 |                     }
 85 |                 }
 86 |             }
 87 |         } else {
 88 |             System.out.println(dir.getName() + "不是目录，跳过");
 89 |         }
 90 |     }
 91 | 
 92 | 
 93 |     public static String getImageSuffix(File image) {
 94 |         FileInputStream fileInputStream;
 95 |         InputStream inputStream;
 96 |         try {
 97 |             fileInputStream = new FileInputStream(image);
 98 |             inputStream = fileInputStream;
 99 |             byte[] array = new byte[10];
100 |             inputStream.read(array, 0, 10);
101 |             if (array[6] == 'J' && array[7] == 'F' && array[8] == 'I' && array[9] == 'F') {
102 |                 inputStream.close();
103 |                 return "jpg";
104 |             } else {
105 |                 inputStream.close();
106 |                 return "png";
107 |             }
108 |         } catch (IOException e) {
109 |             e.printStackTrace();
110 |         }
111 |         return null;
112 | 
113 |     }
114 | }
115 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/fix/MissingPageCompletion.java:
--------------------------------------------------------------------------------
  1 | package com.sslibrary.fix;
  2 | 
  3 | import com.sslibrary.spider.BookDownloader;
  4 | 
  5 | import java.io.FileWriter;
  6 | import java.io.IOException;
  7 | import java.nio.file.Files;
  8 | import java.nio.file.Paths;
  9 | import java.util.Iterator;
 10 | import java.util.List;
 11 | import java.util.regex.Matcher;
 12 | import java.util.regex.Pattern;
 13 | 
 14 | /**
 15 |  * 读取下载日志中的错误，进行缺页补全。
 16 |  *
 17 |  * @author padeoe
 18 |  *         Date: 2016/12/09
 19 |  */
 20 | public class MissingPageCompletion {
 21 |     private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString();
 22 |     private Pattern pattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}");
 23 | 
 24 |     /**
 25 |      * 创建一个{@code MissingPageCompletion}对象并将日志路径指定为{@code logLocation}
 26 |      *
 27 |      * @param logLocation 日志文件路径
 28 |      */
 29 |     public MissingPageCompletion(String logLocation) {
 30 |         this.logLocation = logLocation;
 31 |     }
 32 | 
 33 |     /**
 34 |      * 读取日志中所有下载失败的单页信息并重新下载一次。
 35 |      * 重新下载的日志会输入到原日志文件中
 36 |      */
 37 |     public void complete() {
 38 |         try {
 39 |             List<String> lines = Files.readAllLines(Paths.get(logLocation));
 40 |             Iterator<String> iterator = lines.iterator();
 41 |             while (iterator.hasNext()) {
 42 |                 String line = iterator.next();
 43 |                 Matcher matcher = pattern.matcher(line);
 44 |                 String url, location;
 45 |                 if (matcher.find()) {
 46 |                     url = matcher.group(1);
 47 |                     location = matcher.group(2);
 48 |                     System.out.println(url + " " + location);
 49 |                     try {
 50 |                         BookDownloader.downloadImage(url, location);
 51 |                         iterator.remove();
 52 |                     } catch (IOException downloadFail) {
 53 |                     }
 54 |                 }
 55 |             }
 56 | 
 57 |             StringBuilder newLog = new StringBuilder();
 58 |             lines.forEach(line -> newLog.append(line).append(System.getProperty("line.separator")));
 59 |             FileWriter writer = new FileWriter(logLocation, false);
 60 |             writer.write(newLog.toString());
 61 |             writer.close();
 62 |         } catch (IOException e) {
 63 |             e.printStackTrace();
 64 |         }
 65 |     }
 66 | 
 67 |     /**
 68 |      * 获取当前指定的日志的位置。
 69 |      * 如果没有指定位置，将默认使用当前路径下的名为{@link BookDownloader#ERROR_LOG_NAME}的文件
 70 |      *
 71 |      * @return 当前指定的日志的位置
 72 |      */
 73 |     public String getLogLocation() {
 74 |         return logLocation;
 75 |     }
 76 | 
 77 |     /**
 78 |      * 指定输入的日志的位置
 79 |      *
 80 |      * @param logLocation 作为输入的日志的位置
 81 |      */
 82 |     public void setLogLocation(String logLocation) {
 83 |         this.logLocation = logLocation;
 84 |     }
 85 | 
 86 |     /**
 87 |      * 获取当前指定的错误日志的单行格式
 88 |      *
 89 |      * @return 错误日志的单行格式
 90 |      */
 91 |     public Pattern getPattern() {
 92 |         return pattern;
 93 |     }
 94 | 
 95 |     /**
 96 |      * 设置日志的单行格式
 97 |      *
 98 |      * @param pattern 日志的单行格式
 99 |      */
100 |     public void setPattern(Pattern pattern) {
101 |         this.pattern = pattern;
102 |     }
103 | 
104 |     public static void main(String[] args) {
105 |         new MissingPageCompletion("/opt/seafile/wkk_test/error.log").complete();
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/fix/Recovery.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.fix;
 2 | 
 3 | import com.sslibrary.spider.BookDownloader;
 4 | 
 5 | import java.nio.file.Paths;
 6 | import java.util.regex.Pattern;
 7 | 
 8 | /**
 9 |  * 错误恢复类。
10 |  *
11 |  * 用于对读取错误日志，进行错误恢复。
12 |  * 错误主要包括页下载失败和书本下载失败两种。
13 |  * @author padeoe
14 |  * @Date 2017/1/11.
15 |  */
16 | public class Recovery {
17 |     private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString();
18 |     private Pattern PageExceptionPattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}");
19 |     private Pattern BookExceptionPattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass='(.*)', detailBookClass='(.*)'\\}");
20 |     private String bookRootLocation;
21 |     public static final String FIX_LOG_FILENAME="fix.log";
22 | 
23 |     /**
24 |      * 创建并初始化一个错误恢复对象。
25 |      *
26 |      * 指定错误日志文件的路径和书本下载的存储根路径
27 |      * @param logLocation 错误日志文件的路径
28 |      * @param bookRootLocation 书本下载存储路径的根分类路径
29 |      */
30 |     public Recovery(String logLocation, String bookRootLocation) {
31 |         this.logLocation = logLocation;
32 |         this.bookRootLocation = bookRootLocation;
33 |     }
34 | 
35 |     /**
36 |      * 读取错误日志，进行错误恢复
37 |      */
38 |     public void recover(){
39 | 
40 | 
41 |     }
42 | 
43 |     /**
44 |      * 设置页下载失败日志行的格式
45 |      * @param pageExceptionPattern 页下载失败日志行的格式
46 |      */
47 |     public void setPageExceptionPattern(Pattern pageExceptionPattern) {
48 |         PageExceptionPattern = pageExceptionPattern;
49 |     }
50 | 
51 |     /**
52 |      * 设置书本下载失败日志行的格式
53 |      * @param bookExceptionPattern 书本下载失败日志行的格式
54 |      */
55 |     public void setBookExceptionPattern(Pattern bookExceptionPattern) {
56 |         BookExceptionPattern = bookExceptionPattern;
57 |     }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/Book.java:
--------------------------------------------------------------------------------
  1 | package com.sslibrary.object;
  2 | 
  3 | import cn.chineseall.Node;
  4 | import com.sslibrary.object.exception.BookDLException;
  5 | import org.jsoup.Jsoup;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.select.Elements;
  8 | import com.sslibrary.spider.BookDownloader;
  9 | import com.sslibrary.spider.NJULib;
 10 | import utils.network.MyHttpRequest;
 11 | 
 12 | import java.io.IOException;
 13 | import java.net.URLDecoder;
 14 | import java.util.List;
 15 | 
 16 | /**
 17 |  * 图书。
 18 |  * <p>
 19 |  * 对应<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a> 中的图书。
 20 |  *
 21 |  * @author padeoe
 22 |  * @Date: 2016/12/08
 23 |  */
 24 | public class Book {
    /**
     * The book's id, a unique identifier defined by the
     * <a href="http://114.212.7.104:8181/markbook/">Nanjing University digitized library collection platform</a>.
     */
    private String id;
    /**
     * The book's title; expected to always include the title marks 《》.
     */
    private String name;
    private String author;
    /**
     * The book's publisher.
     */
    private String press;

    private String outlineUrl;
 40 | 
    /**
     * Returns the stored outline URL.
     *
     * @return the outline URL, or {@code null} if none has been set
     */
    public String getOutlineUrl() {
        return outlineUrl;
    }

    /**
     * Stores the outline URL.
     *
     * @param outlineUrl the outline URL
     */
    public void setOutlineUrl(String outlineUrl) {
        this.outlineUrl = outlineUrl;
    }
 48 | 
    // The book's outline as a list of nodes; null until set via setOutline.
    private List<Node> outline;

    /**
     * Returns the stored outline.
     *
     * @return the outline nodes, or {@code null} if none has been set
     */
    public List<Node> getOutline() {
        return outline;
    }

    /**
     * Stores the outline.
     *
     * @param outline the outline nodes
     */
    public void setOutline(List<Node> outline) {
        this.outline = outline;
    }
 58 | 
    /**
     * Initialises a newly created {@code Book} that only knows its id.
     * <p>
     * If you do not have the id at hand, call {@link #getBookFromUrl(String)} to obtain an
     * instance from the book's online reading address, or use the methods of
     * {@link com.sslibrary.spider.BookSearch} to query by title or other fields and create
     * matching book instances.
     *
     * @param id the book id; must agree with the server of the
     *           <a href="http://114.212.7.104:8181/markbook/">Nanjing University digitized library collection platform</a>
     */
    public Book(String id) {
        this.id = id;
    }
 70 | 
    /**
     * Returns the book's id.
     *
     * @return the book id
     */
    public String getId() {
        return id;
    }

    /**
     * Sets the book's id.
     *
     * @param id the book id
     */
    public void setId(String id) {
        this.id = id;
    }
 88 | 
    /**
     * Returns the book's title.
     *
     * @return the title; expected to include the title marks 《》
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the book's title.
     *
     * @param name the title
     */
    public void setName(String name) {
        this.name = name;
    }
106 | 
    /**
     * Returns the book's author; may be {@code null}.
     *
     * @return the author
     */
    public String getAuthor() {
        return author;
    }

    /**
     * Sets the book's author.
     *
     * @param author the author
     */
    public void setAuthor(String author) {
        this.author = author;
    }
124 | 
    /**
     * Returns the book's publisher.
     *
     * @return the publisher, or {@code null} if it has not been set
     */
    public String getPress() {
        return press;
    }

    /**
     * Sets the book's publisher.
     *
     * @param press the publisher
     */
    public void setPress(String press) {
        this.press = press;
    }
140 | 
    /**
     * Returns the book's publish date.
     *
     * @return the publish date
     */
    public String getPublishDate() {
        return publishDate;
    }

    /**
     * Sets the book's publish date.
     *
     * @param publishDate the publish date
     */
    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }
153 | 
    /**
     * Returns the book's subject term; may be {@code null}.
     *
     * @return the subject term
     */
    public String getTheme() {
        return theme;
    }

    /**
     * Sets the book's subject term.
     *
     * @param theme the subject term
     */
    public void setTheme(String theme) {
        this.theme = theme;
    }
166 | 
    /**
     * Returns the category the book belongs to.
     *
     * @return the book's category
     */
    public BookClass getBookClass() {
        return bookClass;
    }

    /**
     * Sets the category the book belongs to.
     *
     * @param bookClass the book's category
     */
    public void setBookClass(BookClass bookClass) {
        this.bookClass = bookClass;
    }
179 | 
    /**
     * Returns the lowest-level category the book belongs to.
     *
     * @return a textual description of the category path down to the lowest level,
     * with levels separated by &gt;, for example
     * “数理科学和化学图书馆&gt;数学&gt;总论复分&gt;总论”
     */
    public String getDetailBookClass() {
        return detailBookClass;
    }

    /**
     * Sets the textual category path of the book.
     *
     * @param detailBookClass the category path, levels separated by &gt;
     */
    public void setDetailBookClass(String detailBookClass) {
        this.detailBookClass = detailBookClass;
    }
193 | 
    private String publishDate;
    private String theme;
    /**
     * The category the book belongs to.
     */
    private BookClass bookClass = new RootBookClass();
    /**
     * Textual (Chinese) description of the category the book belongs to.
     * Levels are separated by “>”,
     * for example “数理科学和化学图书馆>数学>总论复分>总论”.
     */
    private String detailBookClass;
206 | 
    /**
     * Returns the cookie stored on this book.
     *
     * @return the cookie, or {@code null} if none has been set
     */
    public String getCookie() {
        return cookie;
    }

    /**
     * Stores a cookie on this book. Package-private.
     *
     * @param cookie the cookie
     */
    void setCookie(String cookie) {
        this.cookie = cookie;
    }

    // Cookie associated with this book; null until setCookie is called.
    private String cookie;
216 | 
    /**
     * Initialises a newly created {@code Book} from its descriptive attributes.
     * The publisher, outline and cookie are not set by this constructor.
     * If you do not have enough information to call it, use {@link #getBookFromUrl(String)}
     * to obtain an instance from the book's online reading address, or use the methods of
     * {@link com.sslibrary.spider.BookSearch} to query by title or other fields and create
     * matching book instances.
     *
     * @param id              the {@code Book}'s id, as named by the server
     * @param name            title
     * @param author          author
     * @param publishDate     publish date
     * @param theme           subject term
     * @param bookClass       the book's category
     * @param detailBookClass the textual category path of the book
     */
    public Book(String id, String name, String author, String publishDate, String theme, BookClass bookClass, String detailBookClass) {
        this.id = id;
        this.name = name;
        this.author = author;
        this.publishDate = publishDate;
        this.theme = theme;
        this.bookClass = bookClass;
        this.detailBookClass = detailBookClass;
    }
239 | 
240 | 
241 |     /**
242 |      * Builds a {@code Book} from the address of its online-reading page.
243 |      *
244 |      * @param onlineReadUrl online-reading URL; the id is taken from the third-from-last "/"-separated segment
245 |      * @return the book, completed through {@link #fillBookInfoByUrl(String)}, or null when the URL has 6 or fewer segments
246 |      */
247 |     public static Book getBookFromUrl(String onlineReadUrl) {
248 |         String[] segments = onlineReadUrl.split("/");
249 |         if (segments == null || segments.length <= 6) {
250 |             return null;
251 |         }
252 |         Book result = new Book(segments[segments.length - 3]);
253 |         result.fillBookInfoByUrl(onlineReadUrl);
254 |         return result;
255 |     }
256 | 
257 |     /**
258 |      * Completes this {@code Book} from its online-reading page.
259 |      * Sets {@link #name} from the page title and, when the title occurs in the text of
260 |      * {@code div[id=bookinfo]}, also {@link #author}, {@link #press} and {@link #publishDate}.
261 |      * Failures are printed and leave the remaining fields untouched.
262 |      *
263 |      * @param url online-reading page of the book
264 |      */
265 |     public void fillBookInfoByUrl(String url) {
266 |         try {
267 |             String html = new BookDownloader(this, url).getBookViewPageHtml();
268 |             Document doc = Jsoup.parse(html);
269 |             this.name = doc.getElementsByTag("title").text().replaceAll("\\s+", " ");
270 |             String bookinfo = doc.select("div[id=bookinfo]").text();
271 |             int authorEnd = bookinfo.indexOf(name);
272 |             if (authorEnd < 0) {
273 |                 return; // title not found in the info text: nothing else can be located
274 |             }
275 |             if (authorEnd > 0) {
276 |                 // the author is the text before the title, minus the one character in between
277 |                 this.author = bookinfo.substring(0, authorEnd - 1);
278 |             }
279 |             String[] fields = bookinfo.substring(authorEnd).split(",");
280 |             if (fields.length > 2) { // presumably "title,press,publish date" - confirm against the page
281 |                 this.press = fields[1];
282 |                 this.publishDate = fields[2];
283 |             }
284 |         } catch (Exception e) { // broad on purpose; this one handler replaces the former BookDLException catch
285 |             e.printStackTrace();
286 |         }
287 |     }
288 | 
289 |     /**
290 |      * Asks the server for the online-reading address of this book.
291 |      *
292 |      * @return URL at which the book can be read online
293 |      * @throws IOException on I/O errors
294 |      */
295 |     public String getbookread() throws IOException {
296 |         resetCookie(); // make sure a session cookie exists before the request
297 |         String requestUrl = NJULib.baseUrl + "/getbookread?"
298 |                 + "BID=" + id + "&ReadMode=0&pdfread=0&displaystyle=0";
299 |         String encodedPath = MyHttpRequest.getWithCookie(requestUrl, null, cookie, "UTF-8", 1000);
300 |         return NJULib.baseUrl + URLDecoder.decode(encodedPath, "UTF-8");
301 |     }
302 | 
303 |     /**
304 |      * Makes sure {@link #cookie} is set. Despite its name, an existing cookie is never replaced.
305 |      *
306 |      * @throws IOException if a new session cannot be obtained
307 |      */
308 |     private void resetCookie() throws IOException {
309 |         if (cookie == null) cookie = NJULib.getSession();
310 |     }
311 | 
312 |     @Override
313 |     public String toString() {
314 |         return "Book{" +
315 |                 "id='" + id + '\'' +
316 |                 ", name='" + name + '\'' +
317 |                 ", author='" + author + '\'' +
318 |                 ", press='" + press + '\'' +
319 |                 ", publishDate='" + publishDate + '\'' +
320 |                 ", theme='" + theme + '\'' +
321 |                 ", bookClass=" + bookClass +
322 |                 ", detailBookClass='" + detailBookClass + '\'' +
323 |                 '}';
324 |     }
325 | 
326 |     /**
327 |      * Downloads this book with the downloader's own settings for path, threads and error log.
328 |      * Per the original notes, every page of the book is stored as one png image.
329 |      *
330 |      * @param onlineReadUrl online-reading URL of the book
331 |      */
332 |     public void download(String onlineReadUrl) {
333 |         new BookDownloader(this, onlineReadUrl).downloadAllImages();
334 |     }
335 | 
336 |     /**
337 |      * Downloads this book; per the original notes every page is stored as one png image
338 |      * inside a folder named after the book that is created under {@code pathname}.
339 |      *
340 |      * @param pathname      directory to store the download in
341 |      * @param threadNumber  number of download threads
342 |      * @param onlineReadUrl online-reading URL of the book
343 |      */
344 |     public void download(String pathname, int threadNumber,String onlineReadUrl) {
345 |         BookDownloader downloader = new BookDownloader(this, onlineReadUrl);
346 |         downloader.setPath(pathname);
347 |         downloader.setThreadNumber(threadNumber);
348 |         downloader.downloadAllImages();
349 |     }
350 | 
351 |     /**
352 |      * Downloads this book with an explicit error-log location.
353 |      *
354 |      * @param pathname      directory to store the download in
355 |      * @param threadNumber  number of threads
356 |      * @param onlineReadUrl online-reading URL of the book
357 |      * @param errorLogPath  path of the error log
358 |      */
359 |     public void download(String pathname, int threadNumber, String onlineReadUrl,String errorLogPath) {
360 |         BookDownloader downloader = new BookDownloader(this, onlineReadUrl);
361 |         downloader.setPath(pathname);
362 |         downloader.setThreadNumber(threadNumber);
363 |         downloader.setErrorLogPath(errorLogPath);
364 |         downloader.downloadAllImages();
365 |     }
366 | 
367 | 
368 |     @Override
369 |     public int hashCode() { // hashes the id string, consistent with equals(); ids need not be numeric
370 |         return id == null ? 0 : id.hashCode();
371 |     }
372 | 
373 |     @Override
374 |     public boolean equals(Object obj) {
375 |         if (!(obj instanceof Book))
376 |             return false;
377 |         if (obj == this)
378 |             return true;
379 |         return this.id.equals(((Book) obj).id);
380 |     }
381 | 
382 |     public com.njulib.object.Book cast(){ // builds the com.njulib counterpart from id, name, author, publishDate and theme
383 |         return new com.njulib.object.Book(id,name,author,publishDate,theme,null,null); // the last two constructor arguments are passed as null
384 |     }
385 | }
386 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/BookClass.java:
--------------------------------------------------------------------------------
  1 | package com.sslibrary.object;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.nodes.Document;
  5 | import org.jsoup.nodes.Element;
  6 | import org.jsoup.select.Elements;
  7 | import com.sslibrary.spider.BookDownloader;
  8 | import com.sslibrary.spider.NJULib;
  9 | import utils.conversion.MyDecoder;
 10 | import utils.network.MyHttpRequest;
 11 | 
 12 | import java.io.IOException;
 13 | import java.nio.file.Paths;
 14 | import java.util.*;
 15 | import java.util.concurrent.atomic.AtomicInteger;
 16 | import java.util.regex.Matcher;
 17 | import java.util.regex.Pattern;
 18 | import java.util.stream.Collectors;
 19 | 
 20 | /**
 21 |  * 图书的分类。
 22 |  * <p>
 23 |  * 对应<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a> 中的图书。
 24 |  * 同时分类名和分类编号满足中图法分类。是树结构。具有查询子分类和查询分类下书籍列表，批量下载分类书籍等功能。
 25 |  * 如果你没有足够信息构造实例，可以通过{@link RootBookClass}查询所有分类来获取实例。
 26 |  * 或者{@link com.sslibrary.spider.BookSearch}中的一些方法获取实例。
 27 |  *
 28 |  * @author padeoe
 29 |  * @Date: 2016/12/08
 30 |  */
 31 | public class BookClass {
 32 |     /**
 33 |      * Category id: the Chinese Library Classification id as defined by the server,
 34 |      * e.g. "0T0P3010".
 35 |      */
 36 |     private String id;
 37 |     /**
 38 |      * Category name.
 39 |      */
 40 |     private String name;
 41 |     /**
 42 |      * Parent category.
 43 |      */
 44 |     private BookClass parent;
 45 |     /**
 46 |      * Sub-categories; {@link #addChild(BookClass)} registers each child under its id and, when it has one, its name.
 47 |      */
 48 |     private Map<String, BookClass> children;
 49 | 
 50 | 
 51 |     /**
 52 |      * Whether the sub-categories in {@link #children} have been loaded.
 53 |      */
 54 |     private boolean isLoaded = false;
 55 | 
 56 |     /**
 57 |      * Returns the cookie this object currently uses.
 58 |      *
 59 |      * @return cookie
 60 |      */
 61 |     public String getCookie() {
 62 |         return cookie;
 63 |     }
 64 | 
 65 |     /**
 66 |      * Sets the {@code cookie}. Loading sub-categories and querying books both need a cookie;
 67 |      * {@link #loadChild()} hands this cookie on to the children it creates,
 68 |      * which avoids requesting a new one over and over.
 69 |      *
 70 |      * @param cookie cookie
 71 |      */
 72 |     public void setCookie(String cookie) {
 73 |         this.cookie = cookie;
 74 |     }
 75 | 
 76 |     /**
 77 |      * Cookie sent to the server when querying category information; initially null.
 78 |      * It is initialized when a method that needs the network is called.
 79 |      * Children created by {@link #loadChild()} receive the same cookie.
 80 |      */
 81 |     private String cookie;
 82 | 
 83 |     /**
 84 |      * Returns the number of sub-categories.
 85 |      * A child is registered under both its id and its name, so distinct values are counted rather than map keys.
 86 |      * @return number of distinct sub-categories
 87 |      */
 88 |     public int getChildCount() {
 89 |         return new HashSet<>(children.values()).size();
 90 |     }
 91 | 
 92 |     /**
 93 |      * Returns the parent category.
 94 |      *
 95 |      * @return the parent category, or null when there is none
 96 |      */
 97 |     public BookClass getParent() {
 98 |         return parent;
 99 |     }
100 | 
101 | 
102 |     /**
103 |      * Returns all sub-categories known so far.
104 |      * The set stays empty until {@link #loadChild()} or {@link #loadAllChild()} has queried the server, or children were added by hand.
105 |      *
106 |      * @return a new set holding the distinct sub-categories
107 |      */
108 |     public Set<BookClass> getChildren() {
109 |         return new HashSet<>(children.values());
110 |     }
111 | 
112 |     /**
113 |      * Looks up one sub-category.
114 |      *
115 |      * @param idOrName name or code of the sub-category, following the Chinese Library Classification
116 |      * @return the sub-category, or null when none is registered under that key
117 |      */
118 |     public BookClass getChild(String idOrName) {
119 |         return children.get(idOrName);
120 |     }
121 | 
122 |     public String getName() {
123 |         return name; // category name; null when the category was created from an id only
124 |     }
125 | 
126 |     public void setName(String name) {
127 |         this.name = name; // does not re-register this category in its parent's child map
128 |     }
129 | 
130 |     public String getId() {
131 |         return id; // classification id of this category
132 |     }
133 | 
134 |     public void setId(String id) {
135 |         this.id = id; // note: equals() and hashCode() are based on this value
136 |     }
137 | 
138 |     public void setParent(BookClass parent) {
139 |         this.parent = parent; // only sets the back reference; the parent's child map is not touched
140 |     }
141 | 
142 |     public boolean isTerminal() {
143 |         return false; // base implementation; presumably TerminalBookClass overrides this - confirm
144 |     }
145 | 
146 |     /**
147 |      * Adds a sub-category, registering it under its id and, when it has one, also under its name.
148 |      *
149 |      * @param bookClass sub-category
150 |      * @return the sub-category previously registered under the same id, or null when there was none and {@code bookClass} was added
151 |      */
152 |     public BookClass addChild(BookClass bookClass) {
153 |         if (bookClass.name != null) {
154 |             children.putIfAbsent(bookClass.name, bookClass);
155 |         }
156 |         return children.putIfAbsent(bookClass.id, bookClass);
157 |     }
158 | 
159 |     /**
160 |      * Creates a category with the given id, name and parent.
161 |      * The map of sub-categories starts out empty.
162 |      *
163 |      * @param id     category id
164 |      * @param name   category name
165 |      * @param parent parent category
166 |      */
167 |     public BookClass(String id, String name, BookClass parent) {
168 |         // the two-argument constructor stores id and name and creates the child map
169 |         this(id, name);
170 |         this.parent = parent;
171 |     }
172 | 
173 |     /**
174 |      * Creates a category with the given id and name; no parent is set.
175 |      *
176 |      * @param id     category id
177 |      * @param name   category name
178 |      */
179 |     public BookClass(String id, String name) {
180 |         this.id = id;
181 |         this.name = name;
182 |         children = new HashMap<>();
183 |     }
184 | 
185 | 
186 |     /**
187 |      * Creates a newly initialized {@code BookClass}
188 |      * whose Chinese Library Classification identifier is {@code id}.
189 |      *
190 |      * @param id classification identifier of the category.
191 |      *           It has to follow the format defined by the <a href="http://114.212.7.104:8181/markbook/">NJU digitized book collection platform</a>
192 |      */
193 |     public BookClass(String id) {
194 |         this.id = id;
195 |         children = new HashMap<>();
196 |         this.isLoaded = false;
197 |     }
198 | 
199 |     /**
200 |      * Loads the direct sub-categories of this category from the server into {@link #children}.
201 |      * Only one level is loaded; the sub-categories of the children are not touched.
202 |      * Nothing happens for a terminal category.
203 |      * <p>
204 |      * Use {@link #loadAllChild()} to load the whole subtree recursively.
205 |      *
206 |      * @throws IOException if querying the server fails
207 |      */
208 |     public void loadChild() throws IOException {
209 |         if (isTerminal()) {
210 |             return;
211 |         }
212 |         checkCookie();
213 |         String postData = "fenlei=" + this.getId() + "&lib=markbook";
214 |         String requestUrl = NJULib.baseUrl + "/classifyview";
215 |         String response = MyHttpRequest.postWithCookie(postData, requestUrl, null, cookie, "UTF-8", "UTF-8", 1000);
216 |         // every <li> of the response describes one sub-category
217 |         for (Element item : Jsoup.parse(response).getElementsByTag("li")) {
218 |             String childId = item.attr("id");
219 |             String childName = MyDecoder.decodeUrlUnicode(item.getElementsByTag("a").text());
220 |             // a node with further children carries an <img> whose onClick mentions getSubTree
221 |             BookClass child = item.getElementsByTag("img").attr("onClick").contains("getSubTree")
222 |                     ? new BookClass(childId, childName, this)
223 |                     : new TerminalBookClass(childId, childName, this);
224 |             child.setCookie(cookie);
225 |             this.addChild(child);
226 |         }
227 |         this.isLoaded = true;
228 |     }
229 | 
230 | 
231 |     /**
232 |      * Recursively loads all sub-categories, down to the terminal categories of every branch.
233 |      *
234 |      * @throws IOException if querying the server fails
235 |      */
236 |     public void loadAllChild() throws IOException {
237 |         if (isTerminal()) {
238 |             return;
239 |         }
240 |         loadChild();
241 |         for (BookClass subCategory : getChildren()) {
242 |             subCategory.loadAllChild();
243 |         }
244 |     }
245 | 
246 | 
247 |     /**
248 |      * 下载分类下所有图书，会迭代测创建分类文件夹
249 |      *
250 |      * @param pathname     存储路径。将在该路径下创建多级分类目录并保存下载的图书
251 |      * @param threadNumber 线程数
252 |      * @param errorLogPath 错误日志路径
253 |      * @throws IOException 连接失败的错误
254 |      */
255 |     public void  downloadWithCataDir(String pathname, int threadNumber, String errorLogPath) throws IOException {
256 |         if (!isTerminal()) {
257 |             loadChild();
258 |             BookClass[]bookClasses=getChildren().toArray(new BookClass[]{});
259 |             for(int i=bookClasses.length-1;i>=0;i--){
260 |                 bookClasses[i].downloadWithCataDir(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath);
261 |             }
262 | /*            for (BookClass child : getChildren()) {
263 |                 child.downloadWithCataDir(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath);
264 |             }*/
265 |         } else {
266 |             downloadAllBooks(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath);
267 |         }
268 |     }
269 | 
270 |     /**
271 |      * 下载分类下所有图书，会迭代测创建分类文件夹
272 |      * 下载存储路径为当前路径，线程数为5，错误日志将保存在当前路径，文件名为{@link BookDownloader#ERROR_LOG_NAME}
273 |      * 可以调用重载{@link #downloadWithCataDir(String, int, String)}设置参数
274 |      *
275 |      * @throws IOException 连接失败的错误
276 |      */
277 |     public void downloadWithCataDir() throws IOException {
278 |         downloadWithCataDir(System.getProperty("user.dir"), 5, Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString());
279 |     }
280 | 
281 |     /**
282 |      * Fetches page {@code page} of this category's book list from the server.
283 |      * Paging is done by the server, with at most 10 books per page.
284 |      * <p>
285 |      * The highest page number can be derived from {@link #queryBooksSize()}.
286 |      *
287 |      * @param page page number within the book list
288 |      * @return the books listed on that page
289 |      * @throws IOException if querying the server fails
290 |      */
291 |     public Set<Book> queryBooks(int page) throws IOException {
292 |         checkCookie();
293 |         String postData = "fenlei=" + this.id + "&mark=all&Page=" + page + "&totalnumber=-1";
294 |         String requestUrl = NJULib.baseUrl + "/markbook/classifybookview.jsp";
295 |         // NOTE(review): "UTF-8" and "GBK" are presumably the request and response
296 |         // encodings - confirm against MyHttpRequest.postWithCookie
297 |         String html = MyHttpRequest.postWithCookie(postData, requestUrl, null, cookie, "UTF-8", "GBK", 1000);
298 |         // the HTML overload does the actual parsing
299 |         return queryBooks(html);
300 |     }
301 | 
302 |     /**
303 |      * Fetches all books of this category using a single worker thread.
304 |      *
305 |      * @return all books of the category
306 |      * @throws IOException if querying the book list from the server fails
307 |      */
308 |     public Set<Book> queryAllBooks() throws IOException {
309 |         return queryAllBooks(1);
310 |     }
311 | 
312 |     /**
313 |      * Fetches every book of this category, loading the list pages with several threads.
314 |      * The first page is requested by the calling thread; the pages from 2 on are shared among
315 |      * {@code threadNumber} worker threads through a common page counter.
316 |      *
317 |      * @param threadNumber number of worker threads
318 |      * @return the books found; when the first page has no "末页" (last page) link, only the
319 |      * books parsed from that page are returned (this case used to yield {@code null})
320 |      * @throws IOException on connection errors while fetching the first page
321 |      */
322 |     public Set<Book> queryAllBooks(int threadNumber) throws IOException {
323 |         checkCookie();
324 |         String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1";
325 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
326 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
327 |         Set<Book> books = queryBooks(html);
328 |         Elements form = Jsoup.parse(html).select("a:contains(末页)");
329 |         if (form.isEmpty()) {
330 |             return books; // no pager: hand back what the first page contained instead of null
331 |         }
332 |         // the text between the last comma and the final character of the href is read as the total book count
333 |         String keyword = form.get(0).attr("href");
334 |         int size = Integer.parseInt(keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1));
335 |         System.out.println("一共 " + size + " 本书");
336 |         // 10 books per page: round up, so an exact multiple of 10 does not add an empty page
337 |         int lastPage = (size + 9) / 10;
338 |         AtomicInteger needGettedPage = new AtomicInteger(2); // next page to fetch; page 1 is already parsed
339 |         // create all workers first, then start them, then wait for every one of them
340 |         List<PageGetThread> threadList = new ArrayList<>();
341 |         for (int threadN = 0; threadN < threadNumber; threadN++) {
342 |             threadList.add(new PageGetThread(needGettedPage, lastPage));
343 |         }
344 |         for (PageGetThread thread : threadList) {
345 |             thread.start();
346 |         }
347 |         for (PageGetThread thread : threadList) {
348 |             try {
349 |                 thread.join();
350 |             } catch (InterruptedException e) {
351 |                 e.printStackTrace();
352 |             }
353 |         }
354 |         // each worker collected into its own set; merge them after the joins
355 |         threadList.forEach(pageGetThread -> books.addAll(pageGetThread.getThreadBooks()));
356 |         return books;
357 |     }
358 | 
359 |     /**
360 |      * Downloads every book of this category directly into {@code pathname}.
361 |      * Page 1 is handled first, then the other list pages from the last down to the second; a book the server lists twice is downloaded once.
362 |      * @param pathname     parent directory of the per-book folders
363 |      * @param threadNumber number of threads
364 |      * @param errorLogPath path of the error log
365 |      * @throws IOException on connection errors while querying the book lists
366 |      */
367 |     public void downloadAllBooks(String pathname, int threadNumber, String errorLogPath) throws IOException {
368 |         checkCookie();
369 |         String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1";
370 |         String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp";
371 |         String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000);
372 |         Set<Book> books = queryBooks(html);
373 |         Elements form = Jsoup.parse(html).select("a:contains(末页)");
374 |         int size = books.size(); // without a "末页" link the first page is all there is
375 |         if (!form.isEmpty()) {
376 |             // the text between the last comma and the final character of the href is read as the total book count
377 |             String keyword = form.get(0).attr("href");
378 |             size = Integer.parseInt(keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1));
379 |         }
380 |         System.out.println(this.getPath()+"一共 " + size + " 本书");
381 |         downloadBooks(books, pathname, threadNumber, errorLogPath);
382 |         int downloaded = books.size(); // real number of attempts; the first page may hold fewer than 10 books
383 |         int lastPage = (size + 9) / 10; // 10 books per page, rounded up
384 |         for (int i = lastPage; i >= 2; i--) {
385 |             for (Book book : queryBooks(i)) {
386 |                 if (books.add(book)) {
387 |                     downloadBooks(Collections.singleton(book), pathname, threadNumber, errorLogPath);
388 |                     downloaded++;
389 |                 } else {
390 |                     System.out.println("服务器返回了重复书籍，跳过 " + book);
391 |                 }
392 |             }
393 |         }
394 |         System.out.println("去重后共" + books.size() + "书，实际下载了" + downloaded + "本书(含失败)");
395 |     }
396 |     /** Downloads each book after resolving its online-reading URL; a book whose URL cannot be resolved is reported and skipped. */
397 |     private void downloadBooks(Set<Book> books, String pathname, int threadNumber, String errorLogPath) {
398 |         for (Book book : books) {
399 |             try { // the 4-argument overload keeps the URL and the error-log path in their own parameters
400 |                 book.download(pathname, threadNumber, book.getbookread(), errorLogPath);
401 |             } catch (IOException e) {
402 |                 e.printStackTrace();
403 |             }
404 |         }
405 |     }
406 | 
407 | 
408 |     /**
409 |      * Worker thread that fetches book-list pages of the enclosing category.
410 |      * Workers that share one counter draw their page numbers from it.
411 |      */
412 |     class PageGetThread extends Thread {
413 |         /** Books gathered by this worker only. */
414 |         Set<Book> books = new HashSet<>();
415 |         /** Counter handing out the next page number. */
416 |         AtomicInteger needGettedPage;
417 |         /** Highest page number to request. */
418 |         int lastPage;
419 |         /** Creates a worker on the given page counter that stops after {@code lastPage}. */
420 |         public PageGetThread(AtomicInteger needGettedPage, int lastPage) {
421 |             this.needGettedPage = needGettedPage;
422 |             this.lastPage = lastPage;
423 |         }
424 |         /** Fetches pages until the counter passes {@link #lastPage}; a page that fails is reported and skipped. */
425 |         @Override
426 |         public void run() {
427 |             // getAndIncrement hands out each page number once
428 |             for (int page = needGettedPage.getAndIncrement(); page <= lastPage; page = needGettedPage.getAndIncrement()) {
429 |                 try {
430 |                     if (page % 10 == 0) {
431 |                         resetCookie(); // page numbers divisible by 10 trigger a fresh session
432 |                     }
433 |                     books.addAll(queryBooks(page));
434 |                 } catch (IOException e) {
435 |                     e.printStackTrace();
436 |                 }
437 |             }
438 |         }
439 |         /** @return the books this worker has collected */
440 |         public Set<Book> getThreadBooks() {
441 |             return books;
442 |         }
443 |     }
444 | 
445 | 
446 |     /**
447 |      * Parses the books contained in an HTML page returned by the server.
448 |      *
449 |      * @param html HTML text with book entries; per the original notes several server pages
450 |      *             share this entry format, so all of them can be passed to this method
451 |      * @return the books recorded in the HTML
452 |      */
453 |     public Set<Book> queryBooks(String html) {
454 |         // a book entry is an <li> element that carries a style attribute
455 |         Elements bookItems = Jsoup.parse(html).select("li[style]");
456 |         return queryBooks(bookItems);
457 |     }
458 | 
459 |     /**
460 |      * Turns the {@code <li>} entries of a book list into {@link Book} objects.
461 |      * The id is read from the onclick handler of the title link, the category chain from the links of the info paragraph, the other fields from the entry text.
462 |      * Entries without a name or id are reported on stdout and skipped.
463 |      * @param booksliNode book entries
464 |      * @return the books that could be parsed
465 |      */
466 |     private Set<Book> queryBooks(Elements booksliNode) {
467 |         // compiled once per call instead of once per entry
468 |         Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
469 |         Pattern minPattern = Pattern.compile(".*(《.*》).*");
470 |         Set<Book> books = new HashSet<>();
471 |         for (Element element : booksliNode) {
472 |             String name, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
473 |             // title and id; jsoup's select() returns an empty list rather than null
474 |             Elements nameIdNode = element.select("p[class=name]");
475 |             name = nameIdNode.text();
476 |             Elements idNode = nameIdNode.select("a[onclick]");
477 |             if (idNode.size() > 0) {
478 |                 String idOnClick = idNode.get(0).attr("onclick");
479 |                 int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
480 |                 if (start != 0 && end >= start) { // the id sits between "(" and the last ","
481 |                     id = idOnClick.substring(start, end);
482 |                 }
483 |             }
484 |             // category chain: every link but the last is an inner category, the last one is terminal
485 |             BookClass[] bookClasses = new BookClass[0];
486 |             Elements bookInfos = element.select("p[class=info]").select("a");
487 |             if (bookInfos.size() > 0) {
488 |                 Element terminalCataNode = bookInfos.last();
489 |                 bookInfos.remove(terminalCataNode);
490 |                 List<BookClass> tmplist = bookInfos.stream()
491 |                         .map(bookInfo -> getBookCata(bookInfo, false))
492 |                         .filter(Objects::nonNull)
493 |                         .collect(Collectors.toList());
494 |                 BookClass terminalBookClass = getBookCata(terminalCataNode, true);
495 |                 if (terminalBookClass != null) {
496 |                     tmplist.add(terminalBookClass);
497 |                 }
498 |                 bookClasses = tmplist.toArray(bookClasses);
499 |             }
500 |             BookClass bookBookClass = new RootBookClass().link(bookClasses);
501 |             // title, author, publication date, subject keywords and category path from the entry text
502 |             String info = element.text();
503 |             Matcher matcher = pattern.matcher(info);
504 |             while (matcher.find()) {
505 |                 name = matcher.group(1);
506 |                 author = matcher.group(2);
507 |                 publishDate = matcher.group(3);
508 |                 theme = matcher.group(4);
509 |                 detailBookClass = matcher.group(5);
510 |             }
511 |             // a title in 《》 anywhere in the text takes precedence over the name found so far
512 |             Matcher minMatcher = minPattern.matcher(info);
513 |             while (minMatcher.find()) {
514 |                 name = minMatcher.group(1);
515 |             }
516 |             if (name == null || id == null) {
517 |                 System.out.println("error: " + info);
518 |                 continue;
519 |             }
520 |             Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
521 |             book.setCookie(cookie);
522 |             books.add(book);
523 |             if (bookBookClass.isTerminal()) {
524 |                 ((TerminalBookClass) bookBookClass).addBook(book);
525 |             } else {
526 |                 System.out.println("未获取到分类信息，将不被归档 " + book);
527 |             }
528 |         }
529 |         return books;
530 |     }
531 | 
532 | 
533 |     /**
534 |      * Builds the category described by one link of a book entry.
535 |      * The category id is everything after the first '=' of the link's href; the link text becomes the name.
536 |      *
537 |      * @param bookInfo   HTML link node of the book entry
538 |      * @param isTerminal whether the link denotes a terminal category
539 |      * @return the category ({@code TerminalBookClass} for a terminal one), or null when the href has no '='
540 |      */
541 |     private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
542 |         String href = bookInfo.attr("href");
543 |         if (href == null) {
544 |             return null;
545 |         }
546 |         int idStart = href.indexOf('=') + 1;
547 |         if (idStart == 0) {
548 |             return null;
549 |         }
550 |         String categoryId = href.substring(idStart);
551 |         BookClass category = isTerminal ? new TerminalBookClass(categoryId) : new BookClass(categoryId);
552 |         category.setName(bookInfo.text());
553 |         return category;
554 |     }
555 | 
556 | 
557 |     /**
558 |      * Asks the server how many books this category holds (per the original notes, books of all sub-categories are included).
559 |      *
560 |      * @return number of books; 0 when the response has no {@code totalnumber} input
561 |      * @throws IOException if the query fails
562 |      */
563 |     public int queryBooksSize() throws IOException {
564 |         checkCookie();
565 |         String postData = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1";
566 |         String requestUrl = NJULib.baseUrl + "/markbook/classifybookview.jsp";
567 |         String html = MyHttpRequest.postWithCookie(postData, requestUrl, null, cookie, "UTF-8", "GBK", 1000);
568 |         // the count is carried by an <input name="totalnumber"> of the first list page
569 |         Elements totalInputs = Jsoup.parse(html).select("input[name=totalnumber]");
570 |         if (totalInputs.isEmpty()) {
571 |             return 0;
572 |         }
573 |         // only the first matching input is consulted
574 |         String reported = totalInputs.get(0).attr("value");
575 |         return Integer.parseInt(reported);
576 |     }
577 | 
578 | 
579 |     /**
580 |      * Makes sure a {@code cookie} exists: a new session is requested only when it is still null.
581 |      *
582 |      * @throws IOException if obtaining the session fails
583 |      */
584 |     private void checkCookie() throws IOException {
585 |         if (cookie == null) cookie = NJULib.getSession();
586 |     }
587 | 
588 |     /**
589 |      * Replaces the {@code cookie} with a freshly requested session.
590 |      *
591 |      * @throws IOException if requesting the session fails
592 |      */
593 |     private void resetCookie() throws IOException {
594 |         cookie = NJULib.getSession();
595 |     }
596 | 
597 |     /**
598 |      * Attaches a chain of sub-categories below this category.
599 |      *
600 |      * @param childBookClasses path of categories: the first is a child of this category, each following one a child of its predecessor
601 |      * @return the last category of the chain (an already registered one is reused); this category when the chain is empty
602 |      */
603 |     public BookClass link(BookClass... childBookClasses) {
604 |         BookClass cursor = this;
605 |         for (BookClass next : childBookClasses) {
606 |             BookClass existing = cursor.addChild(next);
607 |             if (existing == null) {
608 |                 next.parent = cursor; // newly added: wire up its parent and descend into it
609 |                 cursor = next;
610 |             } else {
611 |                 cursor = existing; // same id already registered: continue below the existing node
612 |             }
613 |         }
614 |         return cursor;
615 |     }
616 | 
617 |     /**
618 |      * Collects the books already stored in the categories below this category.
619 |      * No request is made here; use {@link #queryAllBooks()} or its overload to query the server.
620 |      *
621 |      * @return books of all categories below this one
622 |      */
623 |     public Set<Book> getBooks() {
624 |         Set<Book> collected = new HashSet<>();
625 |         this.getChildren().forEach(child -> collected.addAll(child.getBooks()));
626 |         return collected;
627 |     }
628 | 
629 | 
630 |     /**
631 |      * 判断两个{@code BookClass}是否是同一个分类。
632 |      * 仅根据代号即{@link BookClass#id}来判断
633 |      *
634 |      * @param obj 任意对象
635 |      * @return 对象是否是同一个分类
636 |      */
637 |     @Override
638 |     public boolean equals(Object obj) {
639 |         if (!(obj instanceof BookClass))
640 |             return false;
641 |         if (obj == this)
642 |             return true;
643 |         return this.id.equals(((BookClass) obj).id);
644 |     }
645 | 
646 |     /**
647 |      * Returns the path of this category as readable text:
648 |      * {@link BookClass#toString()} of every category from the one directly below the root
649 |      * down to this one, joined with "-".
650 |      *
651 |      * @return the categories on the way from below the root to this category, separated by "-"
652 |      */
653 |     public String getPath() {
654 |         // walk upwards until the root (or a missing parent), putting each ancestor in front
655 |         Deque<BookClass> chain = new ArrayDeque<>();
656 |         for (BookClass node = this; node != null && !node.isRoot(); node = node.getParent()) {
657 |             chain.addFirst(node);
658 |         }
659 |         StringBuilder path = new StringBuilder();
660 |         Iterator<BookClass> remaining = chain.iterator();
661 |         if (remaining.hasNext()) {
662 |             path.append(remaining.next().toString());
663 |         }
664 |         while (remaining.hasNext()) {
665 |             path.append("-").append(remaining.next().toString());
666 |         }
667 |         return path.toString();
668 |     }
669 | 
670 |     /**
671 |      * Tells whether this {@link BookClass} is a root category; this base implementation always answers false
672 |      * (presumably {@link RootBookClass} overrides it - confirm).
673 |      * @return whether this is the root category
674 |      */
675 |     public boolean isRoot() {
676 |         return false;
677 |     }
678 | 
679 |     /**
680 |      * Returns the hash code of this {@code BookClass},
681 |      * which is simply the hash code of {@link #id}.
682 |      *
683 |      * @return hash code
684 |      */
685 |     @Override
686 |     public int hashCode() {
687 |         return id.hashCode();
688 |     }
689 | 
690 |     /**
691 |      * 返回{@code BookClass}的可读字符串描述。
692 |      *
693 |      * @return 格式是 "分类代号(分类名)"，如果分类名为null，则格式是"分类代号"
694 |      */
695 |     @Override
696 |     public String toString() {
697 |         return this.getId() + (this.getName() == null ? "" : "(" + this.getName() + ")");
698 |     }
699 | }
700 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/Books.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | /**
 6 |  * Result of a book search; some methods of {@link com.sslibrary.spider.BookSearch} return it.
 7 |  * Holds the books of the current result page plus the page number, the total number of pages and the total number of books.
 8 |  *
 9 |  * @author padeoe
10 |  * @Date: 2016/12/09
11 |  */
12 | public class Books {
13 |     /** Number of the current result page. */
14 |     private int page;
15 |     /** Total number of books matched by the query. */
16 |     private int totalNums;
17 |     /** Total number of result pages. */
18 |     private int totalPage;
19 |     private Set<Book> bookSet; // books on the current page
20 |     /**
21 |      * Creates the result for one page of a query; all four values are stored as given.
22 |      * @param page      current page number
23 |      * @param totalPage total number of pages
24 |      * @param totalNums total number of books
25 |      * @param bookSet   books on this page
26 |      */
27 |     public Books(int page, int totalPage, int totalNums, Set<Book> bookSet) {
28 |         this.page = page; // was never stored before, so getPage() always returned 0
29 |         this.totalPage = totalPage;
30 |         this.totalNums = totalNums; // was never stored before, so getTotalNums() always returned 0
31 |         this.bookSet = bookSet;
32 |     }
33 |     /** @return total number of books found by the query */
34 |     public int getTotalNums() {
35 |         return totalNums;
36 |     }
37 |     /** @return number of the current page */
38 |     public int getPage() {
39 |         return page;
40 |     }
41 |     /** @return total number of pages */
42 |     public int getTotalPage() {
43 |         return totalPage;
44 |     }
45 |     /** @return books of the current page */
46 |     public Set<Book> getBookSet() {
47 |         return bookSet;
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/InfoReader.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object;
 2 | 
 3 | import com.sslibrary.spider.BookDownloader;
 4 | 
 5 | import java.io.IOException;
 6 | import java.nio.charset.StandardCharsets;
 7 | import java.nio.file.Files;
 8 | import java.nio.file.Paths;
 9 | import java.util.List;
10 | import java.util.regex.Matcher;
11 | import java.util.regex.Pattern;
12 | 
13 | /**
14 |  * info文件解析器。
15 |  * <p>
16 |  * info文件是由{@link com.sslibrary.spider.BookDownloader}在下载过程中创建的文本文件。
17 |  * 记录了一个{@link Book#toString()}
18 |  * 默认名称是{@link com.sslibrary.spider.BookDownloader#INFO_FILE_NAME}。
19 |  * 该类会读取info文件并解析出{@link Book}对象
20 |  *
21 |  * @author padeoe
22 |  * @Date: 2016/12/11
23 |  */
24 | public class InfoReader {
25 |     private String infoFilePath;
26 | 
27 |     public InfoReader(String infoFilePath) {
28 |         this.infoFilePath = infoFilePath;
29 |     }
30 | 
31 |     /**
32 |      * 解析{@code Book}对象，如果未找到返回null
33 |      *
34 |      * @return {@code Book}对象
35 |      */
36 |     public Book read() {
37 |         try {
38 |             List<String> lines = Files.readAllLines(Paths.get(infoFilePath), StandardCharsets.UTF_8);
39 |             String info = "";
40 |             if (lines.size() > 0) {
41 |                 info = lines.get(0);
42 |             }
43 |             Pattern pattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass=(.*), detailBookClass='(.*)'\\}");
44 |             Matcher matcher = pattern.matcher(info);
45 |             if (matcher.find()) {
46 |                 return new Book(matcher.group(1),
47 |                         matcher.group(2),
48 |                         matcher.group(3),
49 |                         matcher.group(4),
50 |                         matcher.group(5),
51 |                         new BookClass(matcher.group(6)),
52 |                         matcher.group(7));
53 |             }
54 |             return null;
55 |         } catch (IOException e) {
56 |             e.printStackTrace();
57 |             return null;
58 |         }
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/RootBookClass.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object;
 2 | 
 3 | /**
 4 |  * 根分类
 5 |  * <p>
 6 |  * 根分类是在中图法分类之外虚拟出的分类。
 7 |  * 用于集合管理所有子分类，以及作为起点，从服务器获取子分类。
 8 |  *
 9 |  * @author padeoe
10 |  * @Date: 2016/12/20
11 |  */
12 | public class RootBookClass extends BookClass {
13 |     public RootBookClass() {
14 |         super("all");
15 |     }
16 | 
17 |     /**
18 |      * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例
19 |      *
20 |      * @return true
21 |      */
22 |     @Override
23 |     public boolean isRoot() {
24 |         return true;
25 |     }
26 | 
27 |     /**
28 |      * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例
29 |      *
30 |      * @return false
31 |      */
32 |     @Override
33 |     public boolean isTerminal() {
34 |         return false;
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/TerminalBookClass.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object;
 2 | 
 3 | import java.util.HashSet;
 4 | import java.util.Set;
 5 | 
 6 | /**
 7 |  * 终端分类。即分类的最末层。
 8 |  * <p>
 9 |  * 采用的是中图法分类，例如"哲学宗教-哲学理论-辩证唯物主义-总论"的最后一个"总论"就是一个终端分类。
10 |  * 只有终端分类下可以存储图书。
11 |  *
12 |  * @author padeoe
13 |  * @Date: 2016/12/20
14 |  */
15 | public class TerminalBookClass extends BookClass {
16 |     private Set<Book> books = new HashSet<>();
17 | 
18 |     /**
19 |      * 创建一个新初始化的{@code BookClass}对象，
20 |      * 使之中图法分类标识是{@code id}
21 |      *
22 |      * @param id 分类的中图法分类标识。
23 |      *           需要和<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>定义的格式一致
24 |      */
25 |     public TerminalBookClass(String id) {
26 |         super(id);
27 |     }
28 | 
29 | 
30 |     /**
31 |      * 构造函数。
32 |      *
33 |      * @param id     分类编号
34 |      * @param name   分类名
35 |      * @param parent 父分类
36 |      */
37 |     public TerminalBookClass(String id, String name, BookClass parent) {
38 |         super(id, name, parent);
39 |     }
40 | 
41 |     /**
42 |      * 获取分类下的书籍
43 |      * 该方法只是返回该分类下现有书籍，不会向服务器查询该分类下所有图书。
44 |      * 如需向服务器查询，请调用{@link BookClass#queryAllBooks()}及其重载方法
45 |      *
46 |      * @return 分类下的书籍。
47 |      */
48 |     public Set<Book> getBooks() {
49 |         return books;
50 |     }
51 | 
52 |     /**
53 |      * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例
54 |      *
55 |      * @return true
56 |      */
57 |     @Override
58 |     public boolean isTerminal() {
59 |         return true;
60 |     }
61 | 
62 | 
63 |     /**
64 |      * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例
65 |      *
66 |      * @return false
67 |      */
68 |     @Override
69 |     public boolean isRoot() {
70 |         return false;
71 |     }
72 | 
73 |     /**
74 |      * 增加分类下图书
75 |      *
76 |      * @param book 图书
77 |      * @return 如果分类下已有该图书，将返回false。如果没有，将添加并返回true
78 |      */
79 |     public boolean addBook(Book book) {
80 |         return books.add(book);
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/exception/BookDLException.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object.exception;
 2 | 
 3 | import com.sslibrary.object.Book;
 4 | 
 5 | /**
 6 |  * 下载某一本书时发生错误。此异常发生在该书对应的文件夹创建之前。因此此书没有任何文件被下载。
 7 |  *
 8 |  * @author padeoe
 9 |  *         Date: 2016/12/12
10 |  */
11 | public class BookDLException extends Exception {
12 |     /**
13 |      * 发生下载错误的书籍
14 |      */
15 |     private Book book;
16 | 
17 |     /**
18 |      * 创意一个初始化的{@code BookDLException}，并指定发生错误的书籍。
19 |      *
20 |      * @param book 发生下载错误的书籍
21 |      */
22 |     public BookDLException(Book book) {
23 |         this.book = book;
24 |     }
25 | 
26 |     /**
27 |      * 获取发生下载错误的书籍
28 |      *
29 |      * @return 发生下载错误的书籍
30 |      */
31 |     public Book getBook() {
32 |         return book;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/exception/BookPagesDLException.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object.exception;
 2 | 
 3 | import java.util.Vector;
 4 | 
 5 | /**
 6 |  * 下载某一本书时发生错误。
 7 |  * <p>
 8 |  * 此异常发生在书本对应文件夹已经创建之后。
 9 |  * 包含了此书所有的书页下载错误{@code PageDLException}，用于错误恢复
10 |  *
11 |  * @author padeoe
12 |  *         Date: 2016/12/10
13 |  */
14 | public class BookPagesDLException extends Exception {
15 |     Vector<PageDLException> pageDLExceptions;
16 | 
17 |     /**
18 |      * 构造一个{@code BookPagesDLException},用此书所有的书页下载错误初始化
19 |      *
20 |      * @param pageDLExceptionList 此书所有的书页下载错误
21 |      */
22 |     public BookPagesDLException(Vector<PageDLException> pageDLExceptionList) {
23 |         this.pageDLExceptions = pageDLExceptionList;
24 |     }
25 | 
26 |     /**
27 |      * 获取页错误的集合
28 |      *
29 |      * @return 此书所有的书页下载错误{@code PageDLException}
30 |      */
31 |     public Vector<PageDLException> getPageDLExceptions() {
32 |         return pageDLExceptions;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/object/exception/PageDLException.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.object.exception;
 2 | 
 3 | /**
 4 |  * 下载图书的某一页时失败。
 5 |  * <p>
 6 |  * 该类包含了错误现场的信息，可用于错误恢复与后期处理
 7 |  *
 8 |  * @author padeoe
 9 |  *         Date: 2016/12/10
10 |  */
11 | public class PageDLException extends Exception {
12 |     private String url;
13 |     private String location;
14 | 
15 |     /**
16 |      * 创建并初始化一个{@code PageDLException}对象。指定下载地址和存储地址。
17 |      *
18 |      * @param url      出错页图片的网络地址
19 |      * @param location 出错页图片本应存储的本地路径。不含图片后缀名
20 |      */
21 |     public PageDLException(String url, String location) {
22 |         super();
23 |         this.url = url;
24 |         this.location = location;
25 |     }
26 | 
27 |     /**
28 |      * 获取出错页的URL
29 |      *
30 |      * @return 出错页的URL
31 |      */
32 |     public String getUrl() {
33 |         return url;
34 |     }
35 | 
36 |     /**
37 |      * 获取出错页图片本应存储的本地路径。
38 |      *
39 |      * @return 出错页图片本应存储的本地路径。不含图片后缀名
40 |      */
41 |     public String getLocation() {
42 |         return location;
43 |     }
44 | 
45 |     @Override
46 |     public String toString() {
47 |         return "PageDLException{" +
48 |                 "url='" + url + '\'' +
49 |                 ", location='" + location + '\'' +
50 |                 '}';
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/spider/BookSearch.java:
--------------------------------------------------------------------------------
  1 | package com.sslibrary.spider;
  2 | 
  3 | import com.sslibrary.object.Book;
  4 | import com.sslibrary.object.Books;
  5 | import com.sslibrary.object.RootBookClass;
  6 | import org.jsoup.Jsoup;
  7 | import org.jsoup.nodes.Document;
  8 | import org.jsoup.select.Elements;
  9 | import utils.network.MyHttpRequest;
 10 | 
 11 | import java.io.IOException;
 12 | import java.net.URLEncoder;
 13 | import java.util.HashMap;
 14 | import java.util.Map;
 15 | import java.util.Set;
 16 | 
 17 | /**
 18 |  * 查询符合条件的书籍。
 19 |  * <p>
 20 |  * 从<a href="http://114.212.7.104:8181/markbook/">南京大学馆藏数字化图书平台</a>查询符合条件的书籍。
 21 |  * 可通过书名或者sql语句查询书籍。
 22 |  * 可以在查询过程中动态创建图书的分类目录结构。
 23 |  *
 24 |  * @author padeoe
 25 |  * @Date: 2016/12/09
 26 |  */
 27 | public class BookSearch {
 28 |     String cookie;
 29 | 
 30 |     /**
 31 |      * 查询
 32 |      *
 33 |      * @throws IOException 查询失败
 34 |      */
 35 |     public BookSearch() throws IOException {
 36 |         this.cookie = NJULib.getSession();
 37 |     }
 38 | 
 39 |     /**
 40 |      * 通过指定sql查询的where子句进行图书查询
 41 |      *
 42 |      * @param sqlWhereClause 一些已知字段包括"书名","主题词","出版日期","作者"
 43 |      * @param page           查询结果列表的页码
 44 |      * @param rootBookClass  查询到的书本将会添加进该分类结构
 45 |      * @return 查询结果，包含查询到的书本列表，书本总数量和结果总页数
 46 |      * @throws IOException 查询失败
 47 |      */
 48 |     public Books searchBySQL(String sqlWhereClause, int page, RootBookClass rootBookClass) throws IOException {
 49 |         String url = NJULib.baseUrl + "/markbook/BookSearch.jsp";
 50 |         String data = "Page=" + page + "&MethodType=1" + "&Library=&KeyName=0&Condition=" + URLEncoder.encode(sqlWhereClause, "UTF-8") + "&Sort=&links=0&PSize=10&_=";
 51 |         Map<String, String> requestProperty = new HashMap<>();
 52 |         requestProperty.put("Content-type", "application/x-www-form-urlencoded; charset=UTF-8");
 53 |         String result = MyHttpRequest.postWithCookie(data, url, requestProperty, cookie, "UTF-8", "GBK", 2000);
 54 |         int totalNums = 0, totalPage = 0;
 55 |         Document doc = Jsoup.parse(result);
 56 |         Elements totalNumsNode = doc.select("input[name=TotalNums]");
 57 |         if (totalNumsNode != null && totalNumsNode.size() > 0) {
 58 |             totalNums = Integer.parseInt(totalNumsNode.get(0).attr("value"));
 59 |         }
 60 |         Elements totalPageNode = doc.select("a[href]:contains(末页)");
 61 |         if (totalPageNode != null && totalPageNode.size() > 0) {
 62 |             String href = totalPageNode.get(0).attr("href");
 63 |             int start = href.indexOf('(') + 1;
 64 |             int end = href.indexOf(')');
 65 |             if (start != 0 && end != -1) {
 66 |                 totalPage = Integer.parseInt(href.substring(start, end));
 67 |             }
 68 |         }
 69 |         Set<Book> books = rootBookClass.queryBooks(result);
 70 |         return new Books(page, totalPage, totalNums, books);
 71 |     }
 72 | 
 73 |     /**
 74 |      * 通过指定sql查询的where子句进行图书查询
 75 |      *
 76 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
 77 |      * @param page           查询结果列表的页码
 78 |      * @return 如果没有匹配结果，返回空的对象
 79 |      * @throws IOException 查询失败
 80 |      */
 81 |     public Books searchBySQL(String sqlWhereClause, int page) throws IOException {
 82 |         return searchBySQL(sqlWhereClause, page, new RootBookClass());
 83 |     }
 84 | 
 85 |     /**
 86 |      * 通过指定sql查询的where子句进行图书查询，只返回第一页结果。
 87 |      *
 88 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
 89 |      * @return 如果没有匹配结果，返回空的对象
 90 |      * @throws IOException 查询失败
 91 |      */
 92 |     public Books searchBySQL(String sqlWhereClause) throws IOException {
 93 |         return searchBySQL(sqlWhereClause, 1);
 94 |     }
 95 | 
 96 |     /**
 97 |      * 通过指定sql查询的where子句进行图书查询
 98 |      *
 99 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
100 |      * @return 查询结果，书的集合
101 |      * @throws IOException 查询失败
102 |      */
103 |     public Set<Book> findAllBySQL(String sqlWhereClause) throws IOException {
104 |         Set<Book> bookSet = null;
105 |         Books firstPageBooks = searchBySQL(sqlWhereClause, 1);
106 |         bookSet = firstPageBooks.getBookSet();
107 |         for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) {
108 |             bookSet.addAll(searchBySQL(sqlWhereClause, i).getBookSet());
109 |         }
110 |         return bookSet;
111 |     }
112 | 
113 |     /**
114 |      * 通过指定sql查询的where子句进行图书查询,并把查询结果中的图书添加进分类结构
115 |      *
116 |      * @param sqlWhereClause where子句，一些已知字段包括"书名","主题词","出版日期","作者"
117 |      * @param rootBookClass  根分类
118 |      * @return 查询结果，书本集合
119 |      * @throws IOException 查询失败
120 |      */
121 |     public Set<Book> findAllBySQL(String sqlWhereClause, RootBookClass rootBookClass) throws IOException {
122 | 
123 |         Books firstPageBooks = searchBySQL(sqlWhereClause, 1, rootBookClass);
124 |         Set<Book> bookSet = firstPageBooks.getBookSet();
125 |         for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) {
126 |             bookSet.addAll(searchBySQL(sqlWhereClause, i, rootBookClass).getBookSet());
127 |         }
128 |         return bookSet;
129 |     }
130 | 
131 |     private Books searchByName(String name) throws IOException {
132 |         return searchBySQL("书名 like '%" + name + "%' ");
133 |     }
134 | }
135 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/spider/NJULib.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.spider;
 2 | 
 3 | import utils.network.MyHttpRequest;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | /**
 8 |  * 用于获取Session
 9 |  *
10 |  * @author padeoe
11 |  *         Date: 2016/12/08
12 |  */
13 | public class NJULib {
14 |     public static final String baseUrl = "http://114.212.7.104:8181";
15 | 
16 |     /**
17 |      * 获取SeesionId
18 |      *
19 |      * @return SeesionId
20 |      * @throws IOException 出现网络错误
21 |      */
22 |     public static String getSession() throws IOException {
23 |         System.out.println("正在重置cookie");
24 |         String Url = baseUrl + "/markbook/";
25 |         return MyHttpRequest.getAndGetCookie(Url, null, "UTF-8", 1000)[1];
26 |     }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/com/sslibrary/spider/PDFGenerator.java:
--------------------------------------------------------------------------------
 1 | package com.sslibrary.spider;
 2 | 
 3 | import cn.chineseall.Node;
 4 | import com.itextpdf.kernel.pdf.*;
 5 | import com.itextpdf.kernel.pdf.action.PdfAction;
 6 | import com.itextpdf.kernel.pdf.navigation.PdfExplicitDestination;
 7 | import com.sslibrary.object.Book;
 8 | 
 9 | import java.io.*;
10 | import java.util.List;
11 | 
12 | /**
13 |  * Created by padeoe on 2017/4/24.
14 |  */
15 | public class PDFGenerator {
16 |     static String converComamndLocation="D:\\ImageMagick-7.0.5-Q16\\convert.exe";
17 |     File sourceDir;
18 |     File outputDir;
19 |     Book book;
20 | 
21 |     public PDFGenerator(File sourceDir, File outputDir, Book book) {
22 |         this.sourceDir = sourceDir;
23 |         this.outputDir = outputDir;
24 |         this.book = book;
25 |     }
26 | 
27 |     public void make(){
28 |         String outputPath=outputDir.toPath().resolve(sourceDir.getName()).toString();
29 |         String[] commands = new String[]{converComamndLocation, "-density","300","-units","PixelsPerInch",sourceDir.getPath()+System.getProperty("file.separator")+"*p*", outputPath+"-tmp.pdf"};
30 |         Runtime runtime = Runtime.getRuntime();
31 |         Process process;
32 |         System.out.println(book.getName()+"开始合成pdf");
33 |         try {
34 |             process = runtime.exec(commands);
35 |             InputStream is = process.getErrorStream();
36 |             InputStreamReader isr = new InputStreamReader(is);
37 |             BufferedReader bf = new BufferedReader(isr);
38 |             String line;
39 |             while ((line = bf.readLine()) != null) {
40 |                 System.out.println(line);
41 |             }
42 |             process.waitFor();
43 |             addBookMark(book,outputPath+"-tmp.pdf",outputPath+".pdf");
44 |         } catch (IOException e) {
45 |             e.printStackTrace();
46 |         } catch (InterruptedException e) {
47 |             e.printStackTrace();
48 |         }
49 |         addBookMark(book,outputPath+"-tmp.pdf",outputPath+".pdf");
50 | 
51 |     }
52 | 
53 |     public static void addBookMark(Book book,String src,String dest){
54 |         PdfDocument pdfDoc = null;
55 |         try {
56 |             pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest));
57 |             PdfOutline root = pdfDoc.getOutlines(false);
58 |             PdfDocumentInfo info=pdfDoc.getDocumentInfo();
59 |             info.setTitle(book.getName());
60 |             info.setAuthor(book.getAuthor());
61 |             List<Node> nodes = book.getOutline();
62 |             addOutline(nodes, root, pdfDoc);
63 |             pdfDoc.close();
64 |         } catch (IOException e) {
65 |             e.printStackTrace();
66 |         }
67 |     }
68 | 
69 | 
70 |     private static void addOutline(List<Node> nodes, PdfOutline root, PdfDocument pdfDocument) {
71 |         for (Node node : nodes) {
72 |             PdfOutline child = root.addOutline(node.getTitle());
73 |             child.addAction(PdfAction.createGoTo(
74 |                     PdfExplicitDestination.createFitH(pdfDocument.getPage(node.getPage()),
75 |                             pdfDocument.getPage(node.getPage()).getPageSize().getTop())));
76 |             addOutline(node.getChildren(), child, pdfDocument);
77 | 
78 |         }
79 |     }
80 | 
81 | 
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/utils/ImageMeger.java:
--------------------------------------------------------------------------------
 1 | package utils;
 2 | 
 3 | import com.njulib.fix.ListBook;
 4 | import com.njulib.object.InfoReader;
 5 | import com.njulib.spider.BookDownloader;
 6 | import utils.conversion.PDFTool;
 7 | 
 8 | import java.io.File;
 9 | import java.io.FileNotFoundException;
10 | import java.net.MalformedURLException;
11 | import java.nio.file.Path;
12 | import java.nio.file.Paths;
13 | import java.util.Arrays;
14 | import java.util.List;
15 | import java.util.stream.Collectors;
16 | 
17 | /**
18 |  * Created by padeoe on 2017/4/27.
19 |  */
20 | public class ImageMeger {
21 |     private String rootDir;
22 |     private String outputDir;
23 | 
24 |     public static void main(String[] args) {
25 |       //  args=new String[]{"G:\\Book","G:\\BookPDF"};
26 |         ImageMeger imageMeger = new ImageMeger(args[0], args[1]);
27 |         imageMeger.start();
28 |     }
29 | 
30 |     public void start() {
31 |         final int[] i = {1};
32 |         ListBook.getAllBooksAndDir(new File(rootDir)).filter(
33 |                 bookAndDir-> !(Paths.get(outputDir,bookAndDir.getDir().getName()+".pdf").toFile().exists())
34 |         ).forEach(
35 |                 bookAndDir -> {
36 |                     try {
37 |                         PDFTool.generatePDFFromImage(
38 |                                 Arrays.stream(
39 |                                         bookAndDir.getDir().listFiles()).filter(
40 |                                         file -> file.getName().endsWith(".png") || file.getName().endsWith("jpg")
41 |                                 ).toArray(File[]::new),
42 |                                 Paths.get(outputDir, bookAndDir.getDir().getName() + ".pdf").toFile(),
43 |                                 bookAndDir.getBook()
44 |                         );
45 |                         String bookName=bookAndDir.getBook().getName();
46 |                         String output="\r"+i[0] + " "+bookName;
47 |                         StringBuffer spaces=new StringBuffer();
48 |                         for(int k=0;k<80-output.length();k++){
49 |                             spaces.append(" ");
50 |                         }
51 |                         System.out.print(output+spaces.toString());
52 |                         i[0]++;
53 | 
54 |                            BookDownloader.deleteDir(bookAndDir.getDir());
55 | 
56 |                     } catch (Exception e) {
57 |                         e.printStackTrace();
58 |                     }
59 |                 }
60 |         );
61 |     }
62 | 
63 |     public ImageMeger(String rootDir, String outputDir) {
64 |         this.rootDir = rootDir;
65 |         this.outputDir = outputDir;
66 |     }
67 | 
68 |     public String getRootDir() {
69 |         return rootDir;
70 |     }
71 | 
72 |     public void setRootDir(String rootDir) {
73 |         this.rootDir = rootDir;
74 |     }
75 | 
76 |     public String getOutputDir() {
77 |         return outputDir;
78 |     }
79 | 
80 |     public void setOutputDir(String outputDir) {
81 |         this.outputDir = outputDir;
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/java/utils/conversion/MyDecoder.java:
--------------------------------------------------------------------------------
 1 | package utils.conversion;
 2 | 
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | /**
 7 |  * @author padeoe
 8 |  * @Date 2016/12/21
 9 |  */
10 | public class MyDecoder {
11 |     /**
12 |      * 将url编码的unicode转成utf-8编码的字符串
13 |      *
14 |      * @param input 类似"%u7ecf%u5178%u7406%u8bba"的格式
15 |      * @return 解码的字符串
16 |      */
17 |     public static String decodeUrlUnicode(String input) {
18 |         Pattern pattern = Pattern.compile("%u?([A-Za-z0-9]{2,4})");
19 |         StringBuilder builder = new StringBuilder();
20 |         Matcher matcher = pattern.matcher(input);
21 |         while (matcher.find()) {
22 |             builder.append((char) Integer.parseInt(matcher.group(1), 16));
23 |         }
24 |         return builder.toString();
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/utils/conversion/PDFMerge.java:
--------------------------------------------------------------------------------
 1 | package utils.conversion;
 2 | 
 3 | import com.itextpdf.kernel.pdf.PdfDocument;
 4 | import com.itextpdf.kernel.pdf.PdfReader;
 5 | import com.itextpdf.kernel.pdf.PdfWriter;
 6 | import org.apache.pdfbox.io.MemoryUsageSetting;
 7 | import org.apache.pdfbox.multipdf.PDFMergerUtility;
 8 | 
 9 | import java.io.File;
10 | import java.io.IOException;
11 | import java.nio.file.Path;
12 | 
13 | public class PDFMerge {
14 | 
15 |     public static void mergePDFs(File[] pdfs, Path outFilePath) throws IOException {
16 |         PDFMergerUtility PDFmerger = new PDFMergerUtility();
17 |         PDFmerger.setDestinationFileName(outFilePath.toString());
18 |         for (File file : pdfs) PDFmerger.addSource(file);
19 |         PDFmerger.mergeDocuments(MemoryUsageSetting.setupMixed(1024 * 1024 * 500));
20 |     }
21 | 
22 |     public static void compressPDF(Path originPDF, Path outfilePath) throws IOException {
23 |         PdfReader pdfReader = new PdfReader(originPDF.toString());
24 |         PdfDocument inputPdfDoc = new PdfDocument(pdfReader);
25 |         File outputPDF = new File(outfilePath.toString());
26 |         PdfDocument outPdfDoc = new PdfDocument(new PdfWriter(outputPDF.getPath()
27 |         ).setSmartMode(true));
28 | 
29 |         int size = inputPdfDoc.getNumberOfPages();
30 |         inputPdfDoc.copyPagesTo(1, size, outPdfDoc);
31 |         outPdfDoc.close();
32 |         inputPdfDoc.close();
33 |     }
34 | 
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/utils/conversion/PDFTool.java:
--------------------------------------------------------------------------------
  1 | package utils.conversion;
  2 | 
  3 | import com.itextpdf.io.image.ImageDataFactory;
  4 | import com.itextpdf.kernel.events.Event;
  5 | import com.itextpdf.kernel.events.IEventHandler;
  6 | import com.itextpdf.kernel.events.PdfDocumentEvent;
  7 | import com.itextpdf.kernel.geom.PageSize;
  8 | import com.itextpdf.kernel.geom.Rectangle;
  9 | import com.itextpdf.kernel.pdf.PdfDocument;
 10 | import com.itextpdf.kernel.pdf.PdfDocumentInfo;
 11 | import com.itextpdf.kernel.pdf.PdfPage;
 12 | import com.itextpdf.kernel.pdf.PdfWriter;
 13 | import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
 14 | import com.itextpdf.layout.Canvas;
 15 | import com.itextpdf.layout.element.Image;
 16 | import com.njulib.object.Book;
 17 | 
 18 | import java.io.*;
 19 | import java.net.MalformedURLException;
 20 | import java.util.*;
 21 | import java.util.stream.Collectors;
 22 | 
 23 | /**
 24 |  * 用于处理前期图片产物，压制成pdf，并给pdf添加书本信息
 25 |  * Created by padeoe on 2017/4/26.
 26 |  */
 27 | public class PDFTool {
 28 | 
 29 |     /**
 30 |      * 将图片合成为一个PDF
 31 |      * @param inputImage 图片，格式为图片格式
 32 |      * @param outputPDF 输出文件
 33 |      * @throws FileNotFoundException
 34 |      * @throws MalformedURLException
 35 |      */
 36 |     public static void generatePDFFromImage(File[] inputImage,File outputPDF) throws FileNotFoundException, MalformedURLException {
 37 |         List<Image>images=new LinkedList<>();
 38 |         for(File file:inputImage){
 39 |             images.add(new Image(ImageDataFactory.create(file.getPath())));
 40 |         }
 41 |         PdfDocument pdfDoc = new PdfDocument(new PdfWriter(outputPDF.getPath()));
 42 | 
 43 |         images.forEach(image -> pdfDoc.addNewPage(new PageSize(new Rectangle(image.getImageScaledWidth(), image.getImageScaledHeight()))));
 44 |         BackgroundEventHandler handler = new BackgroundEventHandler(images);
 45 |         pdfDoc.addEventHandler(PdfDocumentEvent.END_PAGE, handler);
 46 |         pdfDoc.close();
 47 |     }
 48 | 
 49 |     public static void generatePDFFromImage(File[] inputImage,File outputPDF,Book book) throws FileNotFoundException{
 50 |         List<File> sorted = Arrays.asList(inputImage);
 51 |         Collections.sort(sorted, Comparator.comparing(File::getName));
 52 |         inputImage=sorted.toArray(new File[]{});
 53 |         List<Image>images=new LinkedList<>();
 54 |         boolean hasException=false;
 55 |         for(File file:inputImage){
 56 |             try {
 57 |                 images.add(new Image(ImageDataFactory.create(file.getPath())));
 58 |             } catch (MalformedURLException e) {
 59 |                 System.err.println(file.getPath());
 60 |                 e.printStackTrace();
 61 |             } catch (com.itextpdf.io.IOException eee){
 62 |                 hasException=true;
 63 |             }
 64 |         }
 65 |         PdfDocument pdfDoc = new PdfDocument(new PdfWriter(outputPDF.getPath()));
 66 |         PdfDocumentInfo info=pdfDoc.getDocumentInfo();
 67 |         if(book.getName()!=null&&!book.getName().equals("null")){
 68 |             info.setTitle(book.getName());
 69 |         }
 70 |         if(book.getAuthor()!=null&&!book.getAuthor().equals("null")){
 71 |             info.setAuthor(book.getAuthor());
 72 |         }
 73 |         if(book.getTheme()!=null&&!book.getTheme().equals("null")){
 74 |             info.setSubject(book.getTheme());
 75 |         }
 76 |         StringBuffer keyword=new StringBuffer();
 77 |         if(book.getPublishDate()!=null&&!book.getPublishDate().equals("null")){
 78 |             keyword.append("出版时间:"+book.getPublishDate()+"\n");
 79 |         }
 80 |         if(book.getBookClass()!=null&&!book.getBookClass().equals("null")){
 81 |             keyword.append("分类:"+book.getDetailBookClass().replaceAll("图书馆",""));
 82 |         }
 83 |         info.setKeywords(keyword.toString());
 84 |         if(hasException){
 85 |             System.err.println(book.getName()+" 图片格式异常");
 86 |             info.setCreator("exception");
 87 |         }
 88 | 
 89 |         images.forEach(image -> pdfDoc.addNewPage(new PageSize(new Rectangle(image.getImageScaledWidth(), image.getImageScaledHeight()))));
 90 |         BackgroundEventHandler handler = new BackgroundEventHandler(images);
 91 |         pdfDoc.addEventHandler(PdfDocumentEvent.END_PAGE, handler);
 92 |         pdfDoc.close();
 93 |     }
 94 | 
 95 |     private static class BackgroundEventHandler implements IEventHandler {
 96 |         protected List<Image> images;
 97 |         protected int offset=0;
 98 | 
 99 |         public BackgroundEventHandler(List<Image> images) {
100 |             this.images = images;
101 |         }
102 |         @Override
103 |         public void handleEvent(Event event) {
104 |             PdfDocumentEvent docEvent = (PdfDocumentEvent) event;
105 |             PdfDocument pdfDoc = docEvent.getDocument();
106 |             PdfPage page = docEvent.getPage();
107 |             PdfCanvas canvas = new PdfCanvas(page.newContentStreamBefore(),
108 |                     page.getResources(), pdfDoc);
109 |             Rectangle area = page.getPageSize();
110 |             new Canvas(canvas, pdfDoc, area)
111 |                     .add(images.get(offset));
112 |             offset++;
113 |         }
114 |     }
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/utils/network/MyByteArray.java:
--------------------------------------------------------------------------------
 1 | package utils.network;
 2 | 
 3 | /**
 4 |  * @author Nifury
 5 |  *         Date: 2015/12/17
 6 |  */
 7 | public class MyByteArray {
 8 |     private byte[] buffer = new byte[4096];
 9 |     private int position = 0;
10 | 
11 |     public void ensureCapacity(int capacity) {
12 |         if (buffer.length - position < capacity) {
13 |             byte[] tmp = new byte[Math.max(buffer.length * 2, buffer.length + capacity)];
14 |             System.arraycopy(buffer, 0, tmp, 0, position);
15 |             buffer = tmp;
16 |         }
17 |     }
18 | 
19 |     public void addOffset(int delta) {
20 |         position += delta;
21 |     }
22 | 
23 |     public byte[] getBuffer() {
24 |         return buffer;
25 |     }
26 | 
27 |     public int getOffset() {
28 |         return position;
29 |     }
30 | 
31 |     public int getSize() {
32 |         return position;
33 |     }
34 | 
35 |     public static void main(String[] args) {
36 |         MyByteArray array = new MyByteArray();
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/utils/network/MyHttpRequest.java:
--------------------------------------------------------------------------------
  1 | package utils.network;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.io.OutputStream;
  6 | import java.io.UnsupportedEncodingException;
  7 | import java.net.*;
  8 | import java.util.Arrays;
  9 | import java.util.HashMap;
 10 | import java.util.List;
 11 | import java.util.Map;
 12 | 
 13 | /**
 14 |  * 该类用于负责http网络请求，包含get，set等方法
 15 |  *
 16 |  * @author padeoe, Nifury
 17 |  *         Date: 2016/12/09
 18 |  */
 19 | public class MyHttpRequest {
 20 |     public static String[] action(String action, String data, String URL, Map<String, String> requestProperty, String cookie, String inputEncoding, String outputEncoding, int timeout) throws IOException {
 21 |         ReturnData returnData = action_returnbyte(action, data, URL, requestProperty, cookie, inputEncoding, timeout);
 22 |         String result = null;
 23 |         if (returnData.data != null) {
 24 |             result = new String(returnData.data, 0, returnData.data.length, outputEncoding);
 25 |         }
 26 |         List<String> cookies = returnData.getHeaders().get("Set-Cookie");
 27 |         if (cookies != null && cookies.get(0) != null) {
 28 |             return new String[]{result, cookies.get(0)};
 29 |         }
 30 |         return new String[]{result};
 31 |     }
 32 | 
 33 |     /**
 34 |      * POST请求
 35 |      *
 36 |      * @param action          post或get请求
 37 |      * @param data            数据
 38 |      * @param URL             服务器地址
 39 |      * @param requestProperty 请求头
 40 |      * @param cookie          cookie若无则置为空
 41 |      * @param inputEncoding   请求编码
 42 |      * @param timeout         超时时间
 43 |      * @return 字符串数组，第一个元素是响应数据,若长度为2则第二个是返回的cookie
 44 |      * @throws IOException 网络错误
 45 |      */
 46 |     public static ReturnData action_returnbyte(String action, String data, String URL, Map<String, String> requestProperty, String cookie, String inputEncoding, int timeout) throws IOException {
 47 |         byte[] dataAsBytes = new byte[]{};
 48 |         if (data != null) {
 49 |             dataAsBytes = data.getBytes(inputEncoding);
 50 |         }
 51 |         java.net.URL url = new URL(URL);
 52 |         HttpURLConnection connection = (HttpURLConnection) url
 53 |                 .openConnection(/*new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080))*/);
 54 |         connection.setConnectTimeout(timeout);
 55 |         connection.setRequestMethod(action);
 56 |         if (action.toLowerCase().equals("post")) {
 57 |             connection.setDoOutput(true);
 58 |         }
 59 |         //  connection.setUseCaches(false);
 60 |            /*           java 1.6 does not support
 61 |            requestProperty.forEach((k,v) -> connection.setRequestProperty(k, v));
 62 |            */
 63 |         if (requestProperty != null) {
 64 |             for (Map.Entry<String, String> entry : requestProperty.entrySet()) {
 65 |                 connection.setRequestProperty(entry.getKey(), entry.getValue());
 66 |             }
 67 |         }
 68 |         if (data != null) {
 69 |             connection.setRequestProperty("Content-Length", String.valueOf(dataAsBytes.length));
 70 |         }
 71 | 
 72 |         if (cookie != null) {
 73 |             connection.setRequestProperty("Cookie", cookie);
 74 |         }
 75 |         connection.connect();
 76 | 
 77 |            /*          java 1.6 do not support
 78 |             try (OutputStream outputStream = connection.getOutputStream()) {
 79 |                 outputStream.write(dataAsBytes);
 80 |             }*/
 81 |         if (data != null) {
 82 |             OutputStream outputStream = null;
 83 |             try {
 84 |                 outputStream = connection.getOutputStream();
 85 |                 outputStream.write(dataAsBytes);
 86 |             } finally {
 87 |                 if (outputStream != null) {
 88 |                     outputStream.close();
 89 |                 }
 90 | 
 91 |             }
 92 |         }
 93 | 
 94 |         //读取返回数据
 95 |         utils.network.MyByteArray myByteArray = new utils.network.MyByteArray();
 96 | /*          java 1.6 do not support
 97 |             try (InputStream inputStream = connection.getInputStream()) {
 98 |                 len = inputStream.read(readData);
 99 |             }*/
100 | 
101 | 
102 |         InputStream inputStream = null;
103 |         Map<String, List<String>> headers;
104 | /*        if(connection.getURL().toString().indexOf(";")!=-1){
105 |             headers=new HashMap<>();
106 |             headers.put("Set-Cookie", Arrays.asList(connection.getURL().toString().split(";")[1]));
107 |             connection.disconnect();
108 |             return new ReturnData(null, headers);
109 |         }*/
110 | 
111 |         try {
112 |             inputStream = connection.getInputStream();
113 |             while (true) {
114 |                 myByteArray.ensureCapacity(4096);
115 |                 int len = inputStream.read(myByteArray.getBuffer(), myByteArray.getOffset(), 4096);
116 |                 if (len == -1) {
117 |                     break;
118 |                 }
119 |                 myByteArray.addOffset(len);
120 |             }
121 | 
122 |         } finally {
123 |             if (inputStream != null) {
124 |                 {
125 |                     inputStream.close();
126 |                 }
127 |             }
128 |         }
129 |         headers = connection.getHeaderFields();
130 | 
131 |         connection.disconnect();
132 |         byte[] bytes = new byte[myByteArray.getSize()];
133 |         System.arraycopy(myByteArray.getBuffer(),0,bytes,0,bytes.length);
134 |         return new ReturnData(bytes, headers);
135 |     }
136 | 
137 |     /**
138 |      * 获得cookie的POST请求
139 |      *
140 |      * @param postData        请求数据
141 |      * @param URL             服务器地址
142 |      * @param requestProperty 请求头
143 |      * @param inputEncoding   请求编码
144 |      * @param outputEncoding  响应编码
145 |      * @param timeout         超时时间
146 |      * @return 字符串数组，第一个元素是响应数据,第二个是返回的cookie
147 |      * @throws IOException 网络错误
148 |      */
149 |     public static String[] postAndGetCookie(String postData, String URL, Map<String, String> requestProperty, String inputEncoding, String outputEncoding, int timeout) throws IOException {
150 |         return action("POST", postData, URL, requestProperty, null, inputEncoding, outputEncoding, timeout);
151 |     }
152 | 
153 |     /**
154 |      * 发送cookie的POST请求
155 |      *
156 |      * @param postData        请求数据
157 |      * @param URL             服务器地址
158 |      * @param requestProperty 请求头
159 |      * @param cookie          发送的cookie
160 |      * @param inputEncoding   请求编码
161 |      * @param outputEncoding  响应编码
162 |      * @param timeout         超时时间
163 |      * @return 响应数据
164 |      * @throws IOException 网络错误
165 |      */
166 |     public static String postWithCookie(String postData, String URL, Map<String, String> requestProperty, String cookie, String inputEncoding, String outputEncoding, int timeout) throws IOException {
167 |         return action("POST", postData, URL, requestProperty, cookie, inputEncoding, outputEncoding, timeout)[0];
168 |     }
169 | 
170 |     /**
171 |      * POST请求(不含cookie)
172 |      *
173 |      * @param postData        请求数据
174 |      * @param URL             服务器地址
175 |      * @param requestProperty 请求头
176 |      * @param inputEncoding   请求编码
177 |      * @param outputEncoding  响应编码
178 |      * @param timeout         超时时间
179 |      * @return 响应数据
180 |      * @throws IOException 网络错误
181 |      */
182 |     public static String post(String postData, String URL, Map<String, String> requestProperty, String inputEncoding, String outputEncoding, int timeout) throws IOException {
183 |         return action("POST", postData, URL, requestProperty, null, inputEncoding, outputEncoding, timeout)[0];
184 |     }
185 | 
186 | 
187 |     /**
188 |      * 获得cookie的Get请求
189 |      *
190 |      * @param URL             服务器地址
191 |      * @param requestProperty 请求头
192 |      * @param outputEncoding  响应编码
193 |      * @param timeout         超时时间
194 |      * @return 字符串数组，第一个元素是响应数据,第二个是返回的cookie
195 |      * @throws IOException 网络错误
196 |      */
197 |     public static String[] getAndGetCookie(String URL, Map<String, String> requestProperty, String outputEncoding, int timeout) throws IOException {
198 |         return action("GET", null, URL, requestProperty, null, "null", outputEncoding, timeout);
199 |     }
200 | 
201 |     /**
202 |      * 需要cookie的Get请求
203 |      *
204 |      * @param URL             服务器地址
205 |      * @param requestProperty 请求头
206 |      * @param cookie          发送的cookie
207 |      * @param outputEncoding  响应编码
208 |      * @param timeout         超时时间
209 |      * @return 响应数据
210 |      * @throws IOException 网络错误
211 |      */
212 |     public static String getWithCookie(String URL, Map<String, String> requestProperty, String cookie, String outputEncoding, int timeout) throws IOException {
213 |         return action("GET", null, URL, requestProperty, cookie, null, outputEncoding, timeout)[0];
214 |     }
215 | 
216 |     /**
217 |      * POST请求(不含cookie)
218 |      *
219 |      * @param URL             服务器地址
220 |      * @param requestProperty 请求头
221 |      * @param outputEncoding  响应编码
222 |      * @param timeout         超时时间
223 |      * @return 响应数据
224 |      * @throws IOException 网络错误
225 |      */
226 |     public static String get(String URL, Map<String, String> requestProperty, String outputEncoding, int timeout) throws IOException {
227 |         return action("GET", null, URL, requestProperty, null, null, outputEncoding, timeout)[0];
228 |     }
229 | 
230 |     public static int getReturnCode(String action, String postData, String URL, Map<String, String> requestProperty, String inputEncoding, String outputEncoding, int timeout) {
231 |         try {
232 |             byte[] postAsBytes = new byte[]{};
233 |             if (postData != null) {
234 |                 postAsBytes = postData.getBytes(inputEncoding);
235 |             }
236 |             java.net.URL url = new URL(URL);
237 |             HttpURLConnection connection = (HttpURLConnection) url
238 |                     .openConnection();
239 |             connection.setConnectTimeout(timeout);
240 |             connection.setDoOutput(true);
241 |             connection.setRequestMethod(action);
242 |             connection.setUseCaches(false);
243 |            /*           java 1.6 does not support
244 |            requestProperty.forEach((k,v) -> connection.setRequestProperty(k, v));
245 |            */
246 |             if (requestProperty != null) {
247 |                 for (Map.Entry<String, String> entry : requestProperty.entrySet()) {
248 |                     connection.setRequestProperty(entry.getKey(), entry.getValue());
249 |                 }
250 |             }
251 |             connection.setRequestProperty("Content-Length", String.valueOf(postAsBytes.length));
252 |             connection.connect();
253 |             int code = connection.getResponseCode();
254 |             connection.disconnect();
255 |             return code;
256 |         } catch (UnsupportedEncodingException e) {
257 |             System.out.println(e);
258 |             return -1;
259 |         } catch (MalformedURLException malformedURLException) {
260 |             System.out.println(malformedURLException);
261 |             return -2;
262 |         } catch (ProtocolException protocolException) {
263 |             System.out.println(protocolException);
264 |             return -3;
265 |         } catch (IOException ioException) {
266 |             System.out.println(ioException);
267 |             return -4;
268 | 
269 |         }
270 |     }
271 | }
272 | 


--------------------------------------------------------------------------------
/src/main/java/utils/network/ReturnData.java:
--------------------------------------------------------------------------------
 1 | package utils.network;
 2 | 
 3 | import java.util.List;
 4 | import java.util.Map;
 5 | 
 6 | /**
 7 |  * Created by padeoe on 2016/5/12.
 8 |  */
 9 | public class ReturnData {
10 |     byte[] data;
11 |     Map<String, List<String>> headers;
12 | 
13 |     public ReturnData(byte[] data, Map<String, List<String>> headers) {
14 |         this.data = data;
15 |         this.headers = headers;
16 |     }
17 | 
18 |     public byte[] getData() {
19 |         return data;
20 |     }
21 | 
22 |     public Map<String, List<String>> getHeaders() {
23 |         return headers;
24 |     }
25 | }


--------------------------------------------------------------------------------