├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── pom.xml └── src └── main └── java ├── Starter.java ├── cn └── chineseall │ ├── Batch.java │ ├── Book.java │ ├── Class.java │ ├── CoreService.java │ ├── Downloader.java │ ├── FixWhiteSpace.java │ ├── Node.java │ ├── PDFInfo.java │ ├── PDFReader.java │ ├── Tmp.java │ └── yus.java ├── com ├── njulib │ ├── Start.java │ ├── fix │ │ ├── FileRenamer.java │ │ ├── ListBook.java │ │ └── MissingPageCompletion.java │ ├── object │ │ ├── Book.java │ │ ├── BookClass.java │ │ ├── Books.java │ │ ├── InfoReader.java │ │ ├── RootBookClass.java │ │ ├── TerminalBookClass.java │ │ └── exception │ │ │ ├── BookDLException.java │ │ │ ├── BookPagesDLException.java │ │ │ └── PageDLException.java │ └── spider │ │ ├── BookDownloader.java │ │ ├── BookSearch.java │ │ └── NJULib.java └── sslibrary │ ├── Start.java │ ├── fix │ ├── FileRenamer.java │ ├── MissingPageCompletion.java │ └── Recovery.java │ ├── object │ ├── Book.java │ ├── BookClass.java │ ├── Books.java │ ├── InfoReader.java │ ├── RootBookClass.java │ ├── TerminalBookClass.java │ └── exception │ │ ├── BookDLException.java │ │ ├── BookPagesDLException.java │ │ └── PageDLException.java │ └── spider │ ├── BookDownloader.java │ ├── BookSearch.java │ ├── NJULib.java │ └── PDFGenerator.java └── utils ├── ImageMeger.java ├── conversion ├── MyDecoder.java ├── PDFMerge.java └── PDFTool.java └── network ├── MyByteArray.java ├── MyHttpRequest.java └── ReturnData.java /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | 
*.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3-jdk-8 2 | 3 | WORKDIR /code 4 | ADD . . 5 | RUN mvn package 6 | RUN mv target/libpdf*-dependencies.jar target/libpdf.jar 7 | 8 | WORKDIR /ebook 9 | 10 | ENTRYPOINT [ "java", "-jar", "/code/target/libpdf.jar"] 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NJU-lib-Downloader 2 | [超星电子书](http://www.sslibrary.com/)和[书香中国](http://sxnju.chineseall.cn/home/index)的电子书下载器 3 | 4 | 命令行程序。可以下载图书并自动合成PDF。 5 | 6 | ## 依赖 7 | * Java 8 + 8 | 9 | ## 使用方法 10 | 在 [release](https://github.com/padeoe/nju-lib-downloader/releases) 中下载发布的 jar 包,执行以下命令: 11 | ``` 12 | 用法: java -jar libpdf.jar [-c=<tmpPath>] [-p=<outputPath>] [-t=<threadNumber>] URL 13 | URL 书籍链接 14 | -c, --cache_path=<tmpPath> 15 | 临时文件(分页pdf)存储路径,默认为当前路径 16 | -p, --path=<outputPath> pdf存储目录,默认为当前路径 17 | -t=<threadNumber> 线程数量,默认为8 18 | 19 | 示例: java -jar libpdf.jar -t 8 http://sxnju.chineseall.cn/v3/book/detail/VPeZj 20 | java -jar libpdf.jar -t 8 -p /home/pdf/ -c /tmp/pdf "http://img.sslibrary.com/n/slib/book/slib/10649113/65873989af6f4d809862aa11b16f650c/0e71a4d58ffba4e1b202d4b3fb30a81a.shtml?dxbaoku=false&deptid=275&fav=http%3A%2F%2Fwww.sslibrary.com%2Freader%2Fpdg%2Fpdgreader%3Fd%3Da1b248ecb4a78ba2087d8b5d0c5c950d%26ssid%3D10649113&fenlei=080401&spage=1&t=5&username=xxxxxx&view=-1" 21 | 22 | ``` 23 | 24 | ### Docker 25 | 26 | ``` 27 | docker run --rm -v "$PWD":/ebook padeoe/nju-lib-downloader url 28 | ``` 29 | 30 |

特别感谢

31 | 32 | [@Nifury](https://github.com/Nifury) 33 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.padeoe.nju 8 | libpdf 9 | 0.2.11 10 | 11 | 12 | 1.8 13 | 1.8 14 | 15 | 16 | 17 | 18 | maven-assembly-plugin 19 | 20 | 21 | package 22 | 23 | single 24 | 25 | 26 | 27 | 28 | 29 | jar-with-dependencies 30 | 31 | 32 | 33 | Starter 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | junit 43 | junit 44 | RELEASE 45 | 46 | 47 | org.jsoup 48 | jsoup 49 | 1.8.3 50 | 51 | 52 | 53 | com.itextpdf 54 | kernel 55 | 7.1.0 56 | 57 | 58 | 59 | 60 | com.itextpdf 61 | io 62 | 7.1.0 63 | 64 | 65 | 66 | 67 | com.itextpdf 68 | layout 69 | 7.1.0 70 | 71 | 72 | 73 | 74 | com.itextpdf 75 | forms 76 | 7.1.0 77 | 78 | 79 | 80 | 81 | com.itextpdf 82 | pdfa 83 | 7.1.0 84 | 85 | 86 | 87 | 88 | com.itextpdf 89 | sign 90 | 7.1.0 91 | 92 | 93 | 94 | 95 | com.itextpdf 96 | font-asian 97 | 7.1.0 98 | 99 | 100 | 101 | com.itextpdf 102 | pdftest 103 | 7.0.2 104 | 105 | 106 | 107 | org.zeroturnaround 108 | zt-exec 109 | 1.9 110 | 111 | 112 | 113 | 114 | org.apache.pdfbox 115 | pdfbox 116 | 2.0.16 117 | 118 | 119 | 120 | org.apache.pdfbox 121 | pdfbox-tools 122 | 2.0.16 123 | 124 | 125 | 126 | 127 | com.fasterxml.jackson.core 128 | jackson-core 129 | 2.10.1 130 | 131 | 132 | 133 | 134 | com.fasterxml.jackson.core 135 | jackson-databind 136 | 2.10.1 137 | 138 | 139 | 140 | 141 | info.picocli 142 | picocli 143 | 4.1.4 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /src/main/java/Starter.java: -------------------------------------------------------------------------------- 1 | import com.sslibrary.spider.BookDownloader; 2 | import picocli.CommandLine; 3 | 4 | import java.nio.file.Paths; 5 | 6 | /** 7 | * Created by padeoe on 2017/9/8. 
8 | */ 9 | public class Starter implements Runnable { 10 | @CommandLine.Option(names = {"-t"}, description = "线程数量") 11 | private int threadNumber = 8; 12 | 13 | @CommandLine.Parameters(paramLabel = "URL", description = "书籍链接") 14 | private String url; 15 | 16 | @CommandLine.Option(names = {"-p", "--path"}, description = "pdf存储目录") 17 | private String outputPath; 18 | 19 | @CommandLine.Option(names = {"-c", "--cache_path"}, description = "临时文件(分页pdf)存储路径") 20 | private String tmpPath; 21 | 22 | public static void main(String[] args) { 23 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); 24 | int exitCode = new CommandLine(new Starter()).execute(args); 25 | System.exit(exitCode); 26 | } 27 | 28 | @Override 29 | public void run() { 30 | 31 | try { 32 | long begin = System.currentTimeMillis(); 33 | if (url.contains("chineseall.cn")) { 34 | String[] segments = url.split("/"); 35 | String bookId = segments[segments.length - 1]; 36 | cn.chineseall.Downloader bookDownloader = new cn.chineseall.Downloader(bookId, new cn.chineseall.CoreService("Maskeney", "147258")); 37 | bookDownloader.setThreadNumber(threadNumber); 38 | if (tmpPath != null) bookDownloader.setTmpPathDir(Paths.get(tmpPath)); 39 | if (outputPath != null) bookDownloader.setPath(Paths.get(outputPath)); 40 | bookDownloader.downloadBook(); 41 | } else { 42 | if (url.contains("img.sslibrary.com")) { 43 | BookDownloader bookDownloader = new BookDownloader(url); 44 | bookDownloader.setThreadNumber(threadNumber); 45 | if (outputPath != null) bookDownloader.setPath(outputPath); 46 | if (tmpPath != null) bookDownloader.setTmpPath(Paths.get(tmpPath)); 47 | bookDownloader.downloadBook(); 48 | } else { 49 | System.err.println("未能识别的url,请输入chineseall.cn或者img.sslibrary.com开头的书本url"); 50 | } 51 | } 52 | System.out.println("下载结束,耗时" + (System.currentTimeMillis() - begin) / 1000 + "秒"); 53 | } catch (Exception e) { 54 | e.printStackTrace(); 55 | } 56 | } 57 | } 58 | 
-------------------------------------------------------------------------------- /src/main/java/cn/chineseall/Batch.java: --------------------------------------------------------------------------------
package cn.chineseall;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.stream.Stream;

/**
 * Created by padeoe on 2017/4/14.
 *
 * <p>Dumps the {@code toString()} line of every book in one category to a text file.
 */
public class Batch {
    /** Output file used when no second argument is given (the original hard-coded location). */
    private static final String DEFAULT_OUTPUT = "C:\\Users\\padeo\\Desktop\\法律书籍.txt";

    /**
     * @param args optional: {@code args[0]} is the category id (default {@code "D9"}),
     *             {@code args[1]} is the output file (default {@link #DEFAULT_OUTPUT})
     */
    public static void main(String[] args) {
        String className = "D9";
        String outputFile = DEFAULT_OUTPUT;
        if (args != null && args.length > 0) {
            className = args[0];
            if (args.length > 1) {
                outputFile = args[1];
            }
        }
        // The stream is closed explicitly; it is handed to Files.write as an Iterable so the
        // book list never has to be materialised in memory.
        try (Stream<String> bookStream = new Class(className).getNewBooks()
                .flatMap(books -> books.stream().map(Book::toString))) {
            Files.write(Paths.get(outputFile), (Iterable<String>) bookStream::iterator);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/cn/chineseall/Book.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.fasterxml.jackson.databind.node.ObjectNode; 5 | import org.jsoup.Jsoup; 6 | import
org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.network.MyHttpRequest;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by padeoe on 2017/4/10.
 *
 * <p>Metadata of one book on the chineseall site, plus the parsers that build it from the
 * category listing HTML and that fetch the book's table of contents.
 */
public class Book {
    /** Shared JSON parser; {@code ObjectMapper} is thread-safe once configured. */
    private static final ObjectMapper JSON = new ObjectMapper();
    /** How many times {@link #getOutline()} tries before giving up. */
    private static final int OUTLINE_ATTEMPTS = 20;
    /** Timeout passed to {@code MyHttpRequest.get}. */
    private static final int TIMEOUT_MILLIS = 3000;
    /** Path prefix that precedes the book id in listing links. */
    private static final String DETAIL_PATH = "/book/detail/";

    String id;
    /** Id sent as {@code bookId} to the directory-tree endpoint; not set by the constructors. */
    String idInt;
    String name;
    String press;
    String author;
    String publishDate;
    String introduction;
    String coverUrl;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getIdInt() {
        return idInt;
    }

    public void setIdInt(String idInt) {
        this.idInt = idInt;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPress() {
        return press;
    }

    public void setPress(String press) {
        this.press = press;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getPublishDate() {
        return publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getIntroduction() {
        return introduction;
    }

    public void setIntroduction(String introduction) {
        this.introduction = introduction;
    }

    public String getCoverUrl() {
        return coverUrl;
    }

    public void setCoverUrl(String coverUrl) {
        this.coverUrl = coverUrl;
    }

    public Book(String id) {
        this.id = id;
    }

    public Book(String id, String name, String press, String author, String publishDate, String introduction, String coverUrl) {
        this.id = id;
        this.name = name;
        this.press = press;
        this.author = author;
        this.publishDate = publishDate;
        this.introduction = introduction;
        this.coverUrl = coverUrl;
    }

    @Override
    public String toString() {
        return "Book{" +
                "id='" + id + '\'' +
                ", name='" + name + '\'' +
                ", press='" + press + '\'' +
                ", author='" + author + '\'' +
                ", publishDate='" + publishDate + '\'' +
                ", introduction='" + introduction + '\'' +
                ", coverUrl='" + coverUrl + '\'' +
                '}';
    }

    /**
     * Fetches and parses the book's table of contents.
     *
     * <p>The request is retried up to {@value #OUTLINE_ATTEMPTS} times; any failure (network,
     * malformed JSON, missing {@code directoryTree} list) counts as a failed attempt and the
     * exception of the last attempt is rethrown.
     *
     * @return the top-level outline entries, never {@code null}
     * @throws IOException if the last attempt failed with an I/O or JSON error
     */
    public List<Node> getOutline() throws IOException {
        //http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871
        String url = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF";
        for (int attempt = 1; ; attempt++) {
            try {
                String response = MyHttpRequest.get(url, null, "UTF-8", TIMEOUT_MILLIS);
                // The endpoint wraps an HTML fragment in the "data" field of a JSON object.
                String html = JSON.readValue(response, ObjectNode.class).get("data").textValue();
                Elements tree = Jsoup.parse(html).select("ul[id=directoryTree]");
                return parseUL(tree.get(0));
            } catch (Exception e) {
                if (attempt >= OUTLINE_ATTEMPTS) {
                    throw e;
                }
            }
        }
    }

    /**
     * Converts every {@code li} child of a {@code ul} element into an outline node.
     *
     * @param element the {@code ul} element
     * @return one node per {@code li} child, in document order
     */
    protected List<Node> parseUL(Element element) {
        List<Node> nodes = new LinkedList<>();
        for (Element child : element.children()) {
            if (child.nodeName().equals("li")) {
                nodes.add(parseLi(child));
            }
        }
        return nodes;
    }

    /**
     * Converts one {@code li} element into a node: a lone {@code a} child is a leaf, otherwise
     * a {@code span} child supplies title/page and {@code ul} children supply sub-entries.
     */
    protected Node parseLi(Element liElement) {
        Elements children = liElement.children();
        if (children.size() == 1 && children.get(0).nodeName().equals("a")) {
            return parseA(children.get(0));
        }
        Node root = new Node();
        for (Element child : children) {
            if (child.nodeName().equals("span")) {
                // Replaces root, so this relies on the span preceding any sibling ul.
                root = parseSpan(child);
            }
            if (child.nodeName().equals("ul")) {
                root.addAll(parseUL(child));
            }
        }
        return root;
    }

    /**
     * Builds a node from the first child of a {@code span}; an empty span yields a blank node
     * (the original null check could never fail and {@code child(0)} then threw).
     */
    protected Node parseSpan(Element spanElement) {
        if (spanElement.children().isEmpty()) {
            return new Node();
        }
        return parseA(spanElement.child(0));
    }

    /**
     * Builds a leaf node from a link: the link text is the title, the {@code rel} attribute the page.
     *
     * @throws NumberFormatException if {@code rel} is missing or not an integer
     */
    protected Node parseA(Element aElement) {
        Node result = new Node();
        result.setTitle(aElement.text());
        result.setPage(Integer.parseInt(aElement.attr("rel")));
        return result;
    }

    /**
     * Extracts the books listed on one category page.
     *
     * @param html page source; {@code null} (e.g. after a failed request) yields an empty list
     * @return the books found, in page order; never {@code null}
     */
    public static List<Book> getBookFromHTML(String html) {
        List<Book> books = new ArrayList<>(30);
        if (html == null) {
            return books;
        }
        for (Element item : Jsoup.parse(html).select("div[class=boxListLi5]")) {
            Book book = parseListItem(item);
            if (book != null) {
                books.add(book);
            }
        }
        return books;
    }

    /** Parses one listing entry; returns {@code null} when it has no titled link to take the id from. */
    private static Book parseListItem(Element item) {
        Elements idNameNode = item.select("a[href][title]");
        if (idNameNode.isEmpty()) {
            return null;
        }
        String coverUrl = null;
        Elements coverImageNode = item.select("img[src]");
        if (!coverImageNode.isEmpty()) {
            coverUrl = coverImageNode.attr("src");
        }
        String name = idNameNode.get(0).attr("title");
        String id = idNameNode.get(0).attr("href");
        int idIndex = id.indexOf(DETAIL_PATH);
        if (idIndex != -1) {
            id = id.substring(idIndex + DETAIL_PATH.length());
        }

        // The first span holds "author / press / publish date"; anything else is ignored.
        String author = null, press = null, publishDate = null;
        Elements pressNode = item.select("span");
        if (!pressNode.isEmpty()) {
            String[] pressInfoArray = pressNode.get(0).text().split("/");
            if (pressInfoArray.length == 3) {
                author = pressInfoArray[0].trim();
                press = pressInfoArray[1].trim();
                publishDate = pressInfoArray[2].trim();
            }
        }

        String introduction = null;
        Elements introNode = item.select("p");
        if (!introNode.isEmpty()) {
            introduction = introNode.text();
        }
        return new Book(id, name, press, author, publishDate, introduction, coverUrl);
    }

}
-------------------------------------------------------------------------------- /src/main/java/cn/chineseall/Class.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.select.Elements; 6 | import utils.network.MyHttpRequest; 7 | 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | import java.util.concurrent.atomic.AtomicInteger; 14 | import java.util.function.Function; 15 | import java.util.stream.IntStream; 16 | import java.util.stream.Stream; 17 | 18 | import static cn.chineseall.Book.getBookFromHTML; 19 | 20 | /** 21 | * Created by padeoe on 2017/4/11.
22 | */ 23 | public class Class { 24 | private AtomicInteger needGet = new AtomicInteger(1); 25 | private String id; 26 | public Class(String id){ 27 | this.id=id; 28 | } 29 | public int getBookSize() throws IOException { 30 | String url= CoreService.baseUrl+"/org/show/sort/"+id+"/0"; 31 | String result= MyHttpRequest.get(url,null,"UTF-8",3000); 32 | return getBookSizeFromHtml(result); 33 | } 34 | 35 | public static int getBookSizeFromHtml(String html){ 36 | Document doc= Jsoup.parse(html); 37 | Elements sizeNode=doc.select("input[id=totalSize]"); 38 | if(sizeNode!=null&&sizeNode.size()>0){ 39 | String sizeString=sizeNode.attr("value"); 40 | if(sizeString!=null){ 41 | int sizeInt= Integer.parseInt(sizeString); 42 | return sizeInt; 43 | } 44 | } 45 | return -1; 46 | } 47 | public List getBooks(int page) { 48 | String url= CoreService.baseUrl+"/org/show/sort/"+id+"/"+page; 49 | String result= null; 50 | try { 51 | result = MyHttpRequest.get(url,null,"UTF-8",3000); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | return getBookFromHTML(result); 56 | } 57 | 58 | public List getNewBooks(int page) { 59 | String url= CoreService.baseUrl+"/org/show/sort/"+this.id+"/"+page; 60 | String result= null; 61 | try { 62 | result = MyHttpRequest.get(url,null,"UTF-8",3000); 63 | } catch (IOException e) { 64 | e.printStackTrace(); 65 | } 66 | return getBookFromHTML(result); 67 | } 68 | 69 | public Stream> getNewBooks() throws IOException { 70 | int size= getBookSizeFromHtml(MyHttpRequest.get(CoreService.baseUrl+"/org/show/sort/"+this.id+"/0",null,"UTF-8",3000)); 71 | int lastPage = size / 30 + 1;//最后一页的页码 72 | return IntStream.range(0,lastPage+1).parallel().mapToObj(page -> getNewBooks(page)); 73 | } 74 | 75 | 76 | 77 | public Stream> getAllBooks() throws IOException { 78 | int size= getBookSize(); 79 | int lastPage = size / 30 + 1;//最后一页的页码 80 | return IntStream.range(0,lastPage+1).parallel().mapToObj(page -> getBooks(page)); 81 | /* int threadNumber=10; 82 | 83 
| Set books = new HashSet<>(); 84 | List threadList = new ArrayList<>(); 85 | 86 | AtomicInteger needGettedPage = new AtomicInteger(0);//需要获取的页码 87 | 88 | //开始多线程刷所有页码 89 | for (int threadN = 0; threadN < threadNumber; threadN++) { 90 | threadList.add(new PageGetThread(needGettedPage, lastPage)); 91 | } 92 | 93 | for (PageGetThread thread : threadList) { 94 | thread.start(); 95 | } 96 | for (PageGetThread thread : threadList) { 97 | try { 98 | thread.join(); 99 | } catch (InterruptedException e) { 100 | e.printStackTrace(); 101 | } 102 | } 103 | threadList.forEach(pageGetThread -> books.addAll(pageGetThread.getThreadBooks())); 104 | return books;*/ 105 | } 106 | 107 | /** 108 | * 获取所有图书列表的线程 109 | */ 110 | class PageGetThread extends Thread { 111 | Set books = new HashSet<>(); 112 | AtomicInteger needGettedPage; 113 | int lastPage; 114 | 115 | public PageGetThread(AtomicInteger needGettedPage, int lastPage) { 116 | this.needGettedPage = needGettedPage; 117 | this.lastPage = lastPage; 118 | } 119 | 120 | @Override 121 | public void run() { 122 | while (true) { 123 | int gettingpage = needGettedPage.getAndIncrement(); 124 | if (gettingpage <= lastPage) { 125 | // try { 126 | // System.out.println("正在获取第"+gettingpage+"页"); 127 | books.addAll(getBooks(gettingpage)); 128 | /* } catch (IOException e) { 129 | e.printStackTrace(); 130 | }*/ 131 | } else { 132 | break; 133 | } 134 | } 135 | } 136 | 137 | public Set getThreadBooks() { 138 | return books; 139 | } 140 | } 141 | 142 | private static StringBuffer output = new StringBuffer("\n" + 143 | "" + 144 | "\n" + 145 | "\n" + 146 | /* " \n" +*/ 147 | " \n" + 148 | " \n" + 149 | " \n" + 150 | " \n" + 151 | "\n"); 152 | 153 | private static String getBookLineInTable(Book book) { 154 | if (book != null) { 155 | StringBuffer stringBuffer = new StringBuffer(); 156 | stringBuffer.append("\n"); 157 | stringBuffer.append(""); 158 | // stringBuffer.append(getAttr(Book::getId, book)); 159 | 
/*stringBuffer.append(getAttr(Book::getName, book));*/ 160 | stringBuffer.append(getAttr(Book::getAuthor, book)); 161 | stringBuffer.append(getAttr(Book::getPublishDate, book)); 162 | stringBuffer.append(getAttr(Book::getPress, book)); 163 | stringBuffer.append(""); 164 | return stringBuffer.toString(); 165 | } else { 166 | return null; 167 | } 168 | 169 | } 170 | 171 | private static String getAttr(Function attrGetter, Book book) { 172 | StringBuffer stringBuffer = new StringBuffer(); 173 | stringBuffer.append("\n"); 176 | return stringBuffer.toString(); 177 | } 178 | public static void main(String[] args) { 179 | try { 180 | 181 | new Class("D9").getAllBooks().forEach(bookList -> 182 | bookList.forEach(book -> { 183 | output.append(getBookLineInTable(book)); 184 | }) 185 | ); 186 | output.append("
编号书名作者出版年份出版社
"+book.getName()+"
"); 174 | stringBuffer.append(attrGetter.apply(book)); 175 | stringBuffer.append("
\n"); 187 | output.append(""); 188 | FileWriter writer = null; 189 | try { 190 | writer = new FileWriter("D9.html", false); 191 | writer.write(output.toString()); 192 | writer.close(); 193 | } catch (IOException e) { 194 | e.printStackTrace(); 195 | } 196 | 197 | } catch (IOException e) { 198 | e.printStackTrace(); 199 | } 200 | } 201 | 202 | } 203 | -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/CoreService.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.net.*; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | /** 11 | * Created by padeoe on 2017/4/10. 12 | */ 13 | public class CoreService { 14 | private String username; 15 | private String password; 16 | public static final String baseUrl = "http://sxqh.chineseall.cn"; 17 | public CoreService(String username, String password){ 18 | this.username=username; 19 | this.password=password; 20 | } 21 | public String getSession() throws IOException { 22 | Map attr = new HashMap<>(); 23 | attr.put("Referer", baseUrl+"/sso/login.jsps?redirectUrl="+baseUrl); 24 | attr.put("Origin", baseUrl); 25 | String result = getCookie("userName=" + username + "&userPass=" + password + "&redirectUrl="+ URLEncoder.encode(baseUrl), baseUrl + "/sso/logon.jsps", attr, "UTF-8", 3000); 26 | return result; 27 | } 28 | 29 | private String getCookie(String data, String URL, Map requestProperty,String inputEncoding, int timeout) throws IOException { 30 | byte[] dataAsBytes = new byte[]{}; 31 | if (data != null) { 32 | dataAsBytes = data.getBytes(inputEncoding); 33 | } 34 | java.net.URL url = new URL(URL); 35 | HttpURLConnection connection = (HttpURLConnection) url 36 | .openConnection(/*new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080))*/); 37 | 
connection.setConnectTimeout(timeout); 38 | connection.setRequestMethod("POST"); 39 | connection.setDoOutput(true); 40 | 41 | if (requestProperty != null) { 42 | for (Map.Entry entry : requestProperty.entrySet()) { 43 | connection.setRequestProperty(entry.getKey(), entry.getValue()); 44 | } 45 | } 46 | if (data != null) { 47 | connection.setRequestProperty("Content-Length", String.valueOf(dataAsBytes.length)); 48 | } 49 | connection.setInstanceFollowRedirects(false); 50 | connection.connect(); 51 | if (data != null) { 52 | OutputStream outputStream = null; 53 | try { 54 | outputStream = connection.getOutputStream(); 55 | outputStream.write(dataAsBytes); 56 | } finally { 57 | if (outputStream != null) { 58 | outputStream.close(); 59 | } 60 | 61 | } 62 | } 63 | utils.network.MyByteArray myByteArray = new utils.network.MyByteArray(); 64 | Map> headers = connection.getHeaderFields(); 65 | 66 | 67 | connection.disconnect(); 68 | byte[] bytes = new byte[myByteArray.getSize()]; 69 | System.arraycopy(myByteArray.getBuffer(), 0, bytes, 0, bytes.length); 70 | return headers.get("Set-Cookie").get(0); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/FixWhiteSpace.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | import java.nio.file.Files; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | 10 | /** 11 | * Created by padeoe on 2017/4/23. 
12 | */ 13 | public class FixWhiteSpace { 14 | public static void main(String[] args) { 15 | long begin = System.currentTimeMillis(); 16 | List allDir = getAllDir(new File(args[0])); 17 | System.out.println("总数本数 "+allDir.size()); 18 | allDir.parallelStream().forEach(file -> handleDir(file)); 19 | System.out.println((System.currentTimeMillis() - begin) ); 20 | } 21 | 22 | public static List getAllDir(File rootDir){ 23 | if(rootDir.isFile()){ 24 | return null; 25 | } 26 | List result=new LinkedList<>(); 27 | for (File subDir:rootDir.listFiles()){ 28 | if(subDir.getName().startsWith("《")){ 29 | result.add(subDir); 30 | } 31 | else { 32 | result.addAll(getAllDir(subDir)); 33 | } 34 | } 35 | return result; 36 | } 37 | 38 | 39 | public static void handleDir(File dir){ 40 | File []files=dir.listFiles(); 41 | if(files.length>0){ 42 | if(!files[0].getName().endsWith(".txt")){ 43 | if(files[0].length()%1024==0){ 44 | System.out.println(dir.getName()); 45 | fixDir(dir); 46 | } 47 | } 48 | else{ 49 | if(files[1].length()%1024==0){ 50 | System.out.println(dir.getName()); 51 | fixDir(dir); 52 | } 53 | } 54 | } 55 | else{ 56 | System.out.println("空文件夹"+dir.getName()); 57 | } 58 | 59 | } 60 | public static void fixDir(File dir){ 61 | for(File file:dir.listFiles()){ 62 | try { 63 | byte[]imageByte=Files.readAllBytes(file.toPath()); 64 | int length=imageByte.length; 65 | for(int i=imageByte.length-1;i>-1;i--){ 66 | if(imageByte[i]!=0){ 67 | length=i+1; 68 | break; 69 | } 70 | } 71 | RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw"); 72 | randomAccessFile.setLength(length); 73 | randomAccessFile.close(); 74 | } catch (IOException e) { 75 | e.printStackTrace(); 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/Node.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import java.util.LinkedList; 4 | import 
java.util.List; 5 | 6 | /** 7 | * Created by padeoe on 2017/4/11. 8 | */ 9 | public class Node { 10 | private String title; 11 | private int page; 12 | private Listchildren=new LinkedList<>(); 13 | public void addChild(Node node){ 14 | children.add(node); 15 | } 16 | 17 | public String getTitle() { 18 | return title; 19 | } 20 | 21 | public void setTitle(String title) { 22 | this.title = title; 23 | } 24 | 25 | public int getPage() { 26 | return page; 27 | } 28 | 29 | public void setPage(int page) { 30 | this.page = page; 31 | } 32 | 33 | public List getChildren() { 34 | return children; 35 | } 36 | 37 | public Node addAll(Listnodes){ 38 | nodes.forEach(node -> children.add(node)); 39 | return this; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/PDFInfo.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import com.itextpdf.kernel.pdf.*; 4 | import com.itextpdf.kernel.pdf.action.PdfAction; 5 | import com.itextpdf.kernel.pdf.navigation.PdfExplicitDestination; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * Created by padeoe on 2017/4/11. 
12 | */ 13 | public class PDFInfo { 14 | 15 | public static String getTitle(String src){ 16 | try { 17 | PdfDocument pdfDoc = new PdfDocument(new PdfReader(src)); 18 | PdfDocumentInfo info = pdfDoc.getDocumentInfo(); 19 | String title=info.getTitle(); 20 | pdfDoc.close(); 21 | return title; 22 | } catch (Exception e) { 23 | return null; 24 | } 25 | } 26 | 27 | public static void addBookMark(Book book,String src,String dest){ 28 | PdfDocument pdfDoc = null; 29 | try { 30 | pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest)); 31 | PdfOutline root = pdfDoc.getOutlines(false); 32 | PdfDocumentInfo info=pdfDoc.getDocumentInfo(); 33 | info.setTitle(book.getName()); 34 | 35 | info.setAuthor(CoreService.baseUrl+"/book/"+book.getId()); 36 | List nodes = book.getOutline(); 37 | addOutline(nodes, root, pdfDoc); 38 | pdfDoc.close(); 39 | } catch (IOException e) { 40 | e.printStackTrace(); 41 | } 42 | } 43 | 44 | 45 | private static void addOutline(List nodes, PdfOutline root, PdfDocument pdfDocument) { 46 | for (Node node : nodes) { 47 | PdfOutline child = root.addOutline(node.getTitle()); 48 | child.addAction(PdfAction.createGoTo( 49 | PdfExplicitDestination.createFitH(pdfDocument.getPage(node.getPage()), 50 | pdfDocument.getPage(node.getPage()).getPageSize().getTop()))); 51 | addOutline(node.getChildren(), child, pdfDocument); 52 | 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/PDFReader.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | import com.itextpdf.kernel.font.PdfFont; 3 | import com.itextpdf.kernel.geom.Rectangle; 4 | import com.itextpdf.kernel.pdf.PdfDocument; 5 | import com.itextpdf.kernel.pdf.PdfReader; 6 | import com.itextpdf.kernel.pdf.canvas.parser.EventType; 7 | import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor; 8 | import 
com.itextpdf.kernel.pdf.canvas.parser.data.IEventData; 9 | import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo; 10 | import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter; 11 | import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener; 12 | import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy; 13 | import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy; 14 | import com.itextpdf.test.annotations.type.SampleTest; 15 | import org.junit.Assert; 16 | import org.junit.BeforeClass; 17 | import org.junit.Test; 18 | import org.junit.experimental.categories.Category; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | 23 | @Category(SampleTest.class) 24 | public class PDFReader { 25 | // public static final String SRC = "C:\\Users\\padeo\\Desktop\\nameddestinations.pdf"; 26 | public static final String SRC = "C:\\Users\\padeo\\Desktop\\0081.pdf"; 27 | 28 | @BeforeClass 29 | public static void main() throws IOException { 30 | // PdfReader pdfReader = new PdfReader(file); 31 | // PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); 32 | // 33 | // strategy = parser.processContent(currentPage, new SimpleTextExtractionStrategy()); 34 | // content = strategy.getResultantText(); 35 | 36 | File file = new File(SRC); 37 | file.getParentFile().mkdirs(); 38 | } 39 | 40 | @Test 41 | public void manipulatePdf() throws IOException { 42 | PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC)); 43 | Rectangle rect = new Rectangle(36, 750, 523, 56); 44 | 45 | FontFilter fontFilter = new FontFilter(rect); 46 | FilteredEventListener listener = new FilteredEventListener(); 47 | LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter); 48 | new PdfCanvasProcessor(listener).processPageContent(pdfDoc.getFirstPage()); 49 | 50 | String actualText = 
extractionStrategy.getResultantText(); 51 | System.out.println(actualText); 52 | 53 | pdfDoc.close(); 54 | 55 | 56 | } 57 | 58 | 59 | class FontFilter extends TextRegionEventFilter { 60 | public FontFilter(Rectangle filterRect) { 61 | super(filterRect); 62 | } 63 | 64 | @Override 65 | public boolean accept(IEventData data, EventType type) { 66 | return true; 67 | // if (type.equals(EventType.RENDER_TEXT)) { 68 | // TextRenderInfo renderInfo = (TextRenderInfo) data; 69 | // 70 | // PdfFont font = renderInfo.getFont(); 71 | // if (null != font) { 72 | // String fontName = font.getFontProgram().getFontNames().getFontName(); 73 | // System.out.println(fontName); 74 | // return fontName.equals("FZHTK-GBK1-0200020e4"); 75 | // //FZHTK-GBK1-0200020e4 76 | // //return fontName.endsWith("Bold") || fontName.endsWith("Oblique"); 77 | // } 78 | // } 79 | // return false; 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /src/main/java/cn/chineseall/Tmp.java: -------------------------------------------------------------------------------- 1 | package cn.chineseall; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | /** 7 | * Created by padeo on 2017/8/14. 
/**
 * Scratch class that builds a small list of strings and appends to it.
 */
public class Tmp {
    /**
     * Creates a list holding one empty string and appends {@code "233"}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Arrays.asList returns a fixed-size view whose add() throws
        // UnsupportedOperationException, so copy it into a growable list first.
        List<String> strings = new java.util.ArrayList<>(Arrays.asList(""));
        strings.add("233");
    }
}
connection.setRequestProperty("Cookie", cookie); 30 | connection.setRequestProperty("Pragma", "no-cache"); 31 | connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"); 32 | try (InputStream is = connection.getInputStream()) { 33 | utils.network.MyByteArray myByteArray = new utils.network.MyByteArray(); 34 | while (true) { 35 | myByteArray.ensureCapacity(4096); 36 | int len = is.read(myByteArray.getBuffer(), myByteArray.getOffset(), 4096); 37 | if (len == -1) { 38 | break; 39 | } 40 | myByteArray.addOffset(len); 41 | } 42 | byte[] bytes = new byte[myByteArray.getSize()]; 43 | System.arraycopy(myByteArray.getBuffer(),0,bytes,0,bytes.length); 44 | System.out.println(new String(bytes)); 45 | //System.out.println(new String(is.readAllBytes())); 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/java/com/njulib/Start.java: -------------------------------------------------------------------------------- 1 | package com.njulib; 2 | 3 | import com.njulib.object.BookClass; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * @author padeoe 9 | * @Date: 2016/12/10 10 | */ 11 | public class Start { 12 | /** 13 | * 一个使用示例。请修改下面代码的两个文件存储路径,再运行。 14 | * 当前示例会下载计算机分类下所有书。 15 | * 下载过程中可以终止程序从而终止下载。下一次下载时会跳过下载分类中已有的书本。 16 | * 17 | * @param args 18 | */ 19 | public static void main(String[] args) { 20 | //创建一个书目分类,此处定义的是0T0P3010 计算机类,具体解释请参考中图法 21 | // 格式必须和南京大学馆藏数字化图书平台一致 22 | BookClass root = new BookClass("0T0P"); 23 | try { 24 | System.out.println(root.queryBooksSize()); 25 | /*root.downloadWithCataDir("G:\\", 5, "G:\\未分类\\pageDLFail.txt");*/ 26 | } catch (IOException e) { 27 | e.printStackTrace(); 28 | } 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/fix/FileRenamer.java: 
/**
 * Renames page images that earlier versions of the program saved under
 * poorly chosen file names.
 *
 * @author padeoe
 * @Date: 2016/12/13
 */
public class FileRenamer {
    public static void main(String args[]) {
        renameZero("G:\\com.njulib.Test\\");
    }

    /**
     * Zero-pads the numeric file names of all images in every direct
     * sub-directory of {@code rootDirPath} (for example {@code 1.png} becomes
     * {@code 0001.png}).
     *
     * @param rootDirPath directory whose sub-directories each hold one book
     */
    public static void renameZero(String rootDirPath) {
        Path root = Paths.get(rootDirPath);
        File rootDir = root.toFile();
        if (rootDir.isDirectory()) {
            Arrays.asList(childrenOf(rootDir)).parallelStream().forEach(FileRenamer::handleEndDir);
        } else {
            System.out.println("根目录不是目录,终止");
        }
    }

    /**
     * Zero-pads the numeric names of the images stored directly in {@code dir}.
     *
     * @param dir a book directory; anything that is not a directory is skipped
     */
    public static void handleEndDir(File dir) {
        if (dir.isDirectory()) {
            System.out.println("正在处理" + dir.getName());
            Arrays.asList(childrenOf(dir)).parallelStream().forEach(file -> rename(dir, file));
        } else {
            System.out.println(dir.getName() + "不是目录,跳过");
        }
    }

    /**
     * Lists the entries of {@code dir}, mapping the {@code null} that
     * {@link File#listFiles()} returns on I/O errors to an empty array.
     */
    private static File[] childrenOf(File dir) {
        File[] children = dir.listFiles();
        return children == null ? new File[0] : children;
    }

    /**
     * Renames one image so that the part before the first dot is a
     * four-digit, zero-padded number. Files that are not png/jpg, have no dot,
     * or have a non-numeric name are left untouched.
     */
    private static void rename(File dir, File file) {
        String name = file.getName();
        if (!(name.endsWith("png") || name.endsWith("jpg"))) {
            return;
        }
        int dot = name.indexOf('.');
        if (dot < 0) {
            return;
        }
        int pageNumber;
        try {
            pageNumber = Integer.parseInt(name.substring(0, dot));
        } catch (NumberFormatException notAPageNumber) {
            // One oddly named file must not abort the whole (parallel) run.
            System.out.println(file.getName() + "不是数字编号,跳过");
            return;
        }
        // Rebuild the name instead of regex-replacing, so only the prefix changes.
        String newName = String.format("%04d", pageNumber) + name.substring(dot);
        if (newName.equals(name)) {
            return;
        }
        try {
            // resolve() uses the platform separator instead of a hard-coded "\\".
            Files.move(file.toPath(), dir.toPath().resolve(newName));
        } catch (IOException e) {
            System.out.println(file.toString());
        }
    }

    /**
     * Corrects the extension of all images in every direct sub-directory of
     * {@code rootDirPath} so that it matches the detected image format.
     *
     * @param rootDirPath directory whose sub-directories each hold one book
     */
    public static void renameSuffix(String rootDirPath) {
        Path root = Paths.get(rootDirPath);
        File rootDir = root.toFile();
        if (rootDir.isDirectory()) {
            Arrays.asList(childrenOf(rootDir)).parallelStream().forEach(FileRenamer::imageEndDir);
        } else {
            System.out.println("根目录不是目录,终止");
        }
    }

    /**
     * Corrects the extension of the png/jpg files stored directly in
     * {@code dir} when it disagrees with {@link #getImageSuffix(File)}.
     *
     * @param dir a book directory; anything that is not a directory is skipped
     */
    public static void imageEndDir(File dir) {
        if (dir.isDirectory()) {
            System.out.println("正在处理" + dir.getName());
            for (File file : childrenOf(dir)) {
                String name = file.getName();
                if (!(name.endsWith("png") || name.endsWith("jpg"))) {
                    continue;
                }
                int dot = name.indexOf('.');
                if (dot < 0) {
                    continue;
                }
                // Only files that look like images are opened and inspected.
                String trueSuffix = getImageSuffix(file);
                if (trueSuffix == null || name.endsWith(trueSuffix)) {
                    continue;
                }
                String newName = name.substring(0, dot) + "." + trueSuffix;
                try {
                    Files.move(file.toPath(), dir.toPath().resolve(newName));
                } catch (IOException e) {
                    System.out.println("修改出错" + file.toString());
                }
            }
        } else {
            System.out.println(dir.getName() + "不是目录,跳过");
        }
    }


    /**
     * Guesses the image format from the first ten bytes of a file.
     *
     * @param image file to inspect
     * @return {@code "jpg"} when bytes 6..9 spell {@code JFIF}, {@code "png"}
     *         for any other readable file, or {@code null} if the file cannot be read
     */
    public static String getImageSuffix(File image) {
        try (InputStream inputStream = new FileInputStream(image)) {
            byte[] header = new byte[10];
            int filled = 0;
            // A single read() may return fewer bytes than requested.
            while (filled < header.length) {
                int count = inputStream.read(header, filled, header.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
            boolean jfif = filled == header.length
                    && header[6] == 'J' && header[7] == 'F' && header[8] == 'I' && header[9] == 'F';
            return jfif ? "jpg" : "png";
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;

    }
}
10 | import java.util.function.Function; 11 | import java.util.stream.Collectors; 12 | import java.util.stream.Stream; 13 | 14 | /** 15 | * Created by padeoe on 2017/4/25. 16 | */ 17 | public class ListBook { 18 | private static int id = 1; 19 | private static StringBuffer output = new StringBuffer("\n" + 20 | "" + 21 | "\n" + 22 | "\n" + 23 | " \n" + 24 | " \n" + 25 | " \n" + 26 | " \n" + 27 | " \n" + 28 | " \n" + 29 | "\n"); 30 | 31 | public static void main(String[] args) { 32 | getAllBooks(new File(args[0])).forEach(book -> output.append(getBookLineInTable(book))); 33 | output.append("
id编号书名作者出版年份分类
\n"); 34 | output.append(""); 35 | FileWriter writer = null; 36 | try { 37 | writer = new FileWriter("out.html", false); 38 | writer.write(output.toString()); 39 | writer.close(); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | 44 | } 45 | 46 | public static List getAllBooks(File rootDir) { 47 | List result = new LinkedList<>(); 48 | for (File subDir : rootDir.listFiles()) { 49 | if (subDir.getName().startsWith("《")) { 50 | File infoFile = subDir.toPath().resolve("info.txt").toFile(); 51 | if (infoFile.exists()) { 52 | result.add(new InfoReader(infoFile.getPath()).read()); 53 | } 54 | 55 | } else { 56 | result.addAll(getAllBooks(subDir)); 57 | } 58 | } 59 | return result; 60 | } 61 | 62 | public static class BookAndDir { 63 | Book book; 64 | File Dir; 65 | 66 | public BookAndDir(Book book, File dir) { 67 | this.book = book; 68 | Dir = dir; 69 | } 70 | 71 | public Book getBook() { 72 | return book; 73 | } 74 | 75 | public void setBook(Book book) { 76 | this.book = book; 77 | } 78 | 79 | public File getDir() { 80 | return Dir; 81 | } 82 | 83 | public void setDir(File dir) { 84 | Dir = dir; 85 | } 86 | } 87 | 88 | /** 89 | * 获取目录下所有书籍 90 | * 91 | * @param rootDir 92 | * @return 93 | */ 94 | public static Stream getAllBooksAndDir(File rootDir) { 95 | Stream inputFileStream = Arrays.stream(rootDir.listFiles()); 96 | return inputFileStream.flatMap(subDir -> { 97 | if (subDir.getName().startsWith("《")) { 98 | File infoFile = subDir.toPath().resolve("info.txt").toFile(); 99 | if (infoFile.exists()) { 100 | return Arrays.stream(new BookAndDir[]{new BookAndDir(new InfoReader(infoFile.getPath()).read(), subDir)}); 101 | } 102 | return null; 103 | } else { 104 | return getAllBooksAndDir(subDir); 105 | } 106 | }).filter(bookAndDir -> bookAndDir.getBook() != null); 107 | } 108 | 109 | private static String getBookLineInTable(Book book) { 110 | if (book != null) { 111 | StringBuffer stringBuffer = new StringBuffer(); 112 | stringBuffer.append("\n"); 113 | 
stringBuffer.append(getLine(id + "")); 114 | id++; 115 | stringBuffer.append(getAttr(Book::getId, book)); 116 | stringBuffer.append(getAttr(Book::getName, book)); 117 | stringBuffer.append(getAttr(Book::getAuthor, book)); 118 | stringBuffer.append(getAttr(Book::getPublishDate, book)); 119 | stringBuffer.append(getAttr(Book::getDetailBookClass, book)); 120 | stringBuffer.append(""); 121 | return stringBuffer.toString(); 122 | } else { 123 | return null; 124 | } 125 | 126 | } 127 | 128 | private static String getAttr(Function attrGetter, Book book) { 129 | return getLine(attrGetter.apply(book)); 130 | } 131 | 132 | private static String getLine(String content) { 133 | StringBuffer stringBuffer = new StringBuffer(); 134 | stringBuffer.append(""); 135 | stringBuffer.append(content); 136 | stringBuffer.append("\n"); 137 | return stringBuffer.toString(); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/fix/MissingPageCompletion.java: -------------------------------------------------------------------------------- 1 | package com.njulib.fix; 2 | 3 | import com.njulib.spider.BookDownloader; 4 | 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.Iterator; 10 | import java.util.List; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | /** 15 | * 读取下载日志中的错误,进行缺页补全。 16 | * 17 | * @author padeoe 18 | * Date: 2016/12/09 19 | */ 20 | public class MissingPageCompletion { 21 | private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString(); 22 | private Pattern pattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}"); 23 | 24 | /** 25 | * 创建一个{@code MissingPageCompletion}对象并将日志路径指定为{@code logLocation} 26 | * 27 | * @param logLocation 日志文件路径 28 | */ 29 | public MissingPageCompletion(String logLocation) { 30 | 
this.logLocation = logLocation; 31 | } 32 | 33 | /** 34 | * 读取日志中所有下载失败的单页信息并重新下载一次。 35 | * 重新下载的日志会输入到原日志文件中 36 | */ 37 | public void complete() { 38 | try { 39 | List lines = Files.readAllLines(Paths.get(logLocation)); 40 | Iterator iterator = lines.iterator(); 41 | while (iterator.hasNext()) { 42 | String line = iterator.next(); 43 | Matcher matcher = pattern.matcher(line); 44 | String url, location; 45 | if (matcher.find()) { 46 | url = matcher.group(1); 47 | location = matcher.group(2); 48 | System.out.println(url + " " + location); 49 | try { 50 | BookDownloader.downloadImage(url, location); 51 | iterator.remove(); 52 | } catch (IOException downloadFail) { 53 | } 54 | } 55 | } 56 | 57 | StringBuilder newLog = new StringBuilder(); 58 | lines.forEach(line -> newLog.append(line).append(System.getProperty("line.separator"))); 59 | FileWriter writer = new FileWriter(logLocation, false); 60 | writer.write(newLog.toString()); 61 | writer.close(); 62 | } catch (IOException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | /** 68 | * 获取当前指定的日志的位置。 69 | * 如果没有指定位置,将默认使用当前路径下的名为{@link BookDownloader#ERROR_LOG_NAME}的文件 70 | * 71 | * @return 当前指定的日志的位置 72 | */ 73 | public String getLogLocation() { 74 | return logLocation; 75 | } 76 | 77 | /** 78 | * 指定输入的日志的位置 79 | * 80 | * @param logLocation 作为输入的日志的位置 81 | */ 82 | public void setLogLocation(String logLocation) { 83 | this.logLocation = logLocation; 84 | } 85 | 86 | /** 87 | * 获取当前指定的错误日志的单行格式 88 | * 89 | * @return 错误日志的单行格式 90 | */ 91 | public Pattern getPattern() { 92 | return pattern; 93 | } 94 | 95 | /** 96 | * 设置日志的单行格式 97 | * 98 | * @param pattern 日志的单行格式 99 | */ 100 | public void setPattern(Pattern pattern) { 101 | this.pattern = pattern; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/Book.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | import 
com.njulib.object.exception.BookDLException; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Elements; 8 | import com.njulib.spider.BookDownloader; 9 | import com.njulib.spider.NJULib; 10 | import utils.network.MyHttpRequest; 11 | 12 | import java.io.IOException; 13 | import java.net.URLDecoder; 14 | 15 | /** 16 | * 图书。 17 | *

18 | * 对应南京大学馆藏数字化图书平台 中的图书。 19 | * 20 | * @author padeoe 21 | * @Date: 2016/12/08 22 | */ 23 | public class Book { 24 | /** 25 | * 书的id,唯一识别号,是由南京大学馆藏数字化图书平台定义的 26 | */ 27 | private String id; 28 | /** 29 | * 书名,应该总是包含书名号《》 30 | */ 31 | private String name; 32 | private String author; 33 | 34 | /** 35 | * 初始化一个新创建的{@code Book}对象。 36 | *

37 | * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例, 38 | * 或者使用{@link com.njulib.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。 39 | * 40 | * @param id 书本id,需要和南京大学馆藏数字化图书平台服务器一致 41 | */ 42 | public Book(String id) { 43 | this.id = id; 44 | } 45 | 46 | /** 47 | * 获取书本的编号 48 | * 49 | * @return 书本编号 50 | */ 51 | public String getId() { 52 | return id; 53 | } 54 | 55 | /** 56 | * 设置书本编号 57 | * 58 | * @param id 书本编号 59 | */ 60 | public void setId(String id) { 61 | this.id = id; 62 | } 63 | 64 | /** 65 | * 获取书本名 66 | * 67 | * @return 书名,包含书名号《》 68 | */ 69 | public String getName() { 70 | return name; 71 | } 72 | 73 | /** 74 | * 设置书名 75 | * 76 | * @param name 书名 77 | */ 78 | public void setName(String name) { 79 | this.name = name; 80 | } 81 | 82 | /** 83 | * 获取书本作者,可能是null 84 | * 85 | * @return 书本作者 86 | */ 87 | public String getAuthor() { 88 | return author; 89 | } 90 | 91 | /** 92 | * 设置书本作者 93 | * 94 | * @param author 书本作者 95 | */ 96 | public void setAuthor(String author) { 97 | this.author = author; 98 | } 99 | 100 | /** 101 | * 获取书本出版日期 102 | * 103 | * @return 书本出版日期 104 | */ 105 | public String getPublishDate() { 106 | return publishDate; 107 | } 108 | 109 | public void setPublishDate(String publishDate) { 110 | this.publishDate = publishDate; 111 | } 112 | 113 | /** 114 | * 获取书本主题词,可能是null 115 | * 116 | * @return 书本主题词 117 | */ 118 | public String getTheme() { 119 | return theme; 120 | } 121 | 122 | public void setTheme(String theme) { 123 | this.theme = theme; 124 | } 125 | 126 | /** 127 | * 获取书本所在分类 128 | * 129 | * @return 书本所在分类 130 | */ 131 | public BookClass getBookClass() { 132 | return bookClass; 133 | } 134 | 135 | public void setBookClass(BookClass bookClass) { 136 | this.bookClass = bookClass; 137 | } 138 | 139 | /** 140 | * 获取书本所在末级分类 141 | * 142 | * @return 字符串描述所属分类,最末层的分类,用>分割层级, 143 | * 例如“数理科学和化学图书馆>数学>总论复分>总论” 144 | */ 145 | public String getDetailBookClass() { 146 | return detailBookClass; 147 | } 148 | 149 | public 
void setDetailBookClass(String detailBookClass) { 150 | this.detailBookClass = detailBookClass; 151 | } 152 | 153 | private String publishDate; 154 | private String theme; 155 | /** 156 | * 所属分类 157 | */ 158 | private BookClass bookClass = new RootBookClass(); 159 | /** 160 | * 所属分类的中文描述。 161 | * “>”分割层级, 162 | * 例如“数理科学和化学图书馆>数学>总论复分>总论” 163 | */ 164 | private String detailBookClass; 165 | 166 | public String getCookie() { 167 | return cookie; 168 | } 169 | 170 | void setCookie(String cookie) { 171 | this.cookie = cookie; 172 | } 173 | 174 | private String cookie; 175 | 176 | /** 177 | * 初始化一个新创建的{@code Book}对象。需要{@code Book}的所有属性。 178 | * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例, 179 | * 或者使用{@link com.njulib.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。 180 | * 181 | * @param id {@code Book}的id。该id是服务器命名的 182 | * @param name 书名 183 | * @param author 作者 184 | * @param publishDate 出版日期 185 | * @param theme 主题词 186 | * @param bookClass 书本分类 187 | * @param detailBookClass 书本分类分类名路径 188 | */ 189 | public Book(String id, String name, String author, String publishDate, String theme, BookClass bookClass, String detailBookClass) { 190 | this.id = id; 191 | this.name = name; 192 | this.author = author; 193 | this.publishDate = publishDate; 194 | this.theme = theme; 195 | this.bookClass = bookClass; 196 | this.detailBookClass = detailBookClass; 197 | } 198 | 199 | 200 | /** 201 | * 通过在线阅览的地址来获取{@code Book}对象 202 | * 203 | * @param onlineReadUrl 书本的在线阅读地址 204 | * @return Book对象,仅指定了id 205 | */ 206 | public static Book getBookFromUrl(String onlineReadUrl) { 207 | for (String para : onlineReadUrl.split("&")) { 208 | if (para.startsWith("ssnumber=")) { 209 | Book book = new Book(para.substring(9, para.length())); 210 | book.fillBookInfoByUrl(onlineReadUrl); 211 | return book; 212 | } 213 | } 214 | return null; 215 | } 216 | 217 | /** 218 | * 通过在线阅读页面补全{@code Book}的信息 219 | * 仅可补全{@link #name},{@link #id},{@link #author},{@link #publishDate} 220 | 
* 221 | * @param url 书本的在线阅读页面 222 | */ 223 | public void fillBookInfoByUrl(String url) { 224 | try { 225 | String html = new BookDownloader(this).getBookViewPageHtml(url); 226 | html = html.replaceAll("", ""); 228 | Document doc = Jsoup.parse(html); 229 | Elements nameNode = doc.getElementsByTag("title"); 230 | this.name = nameNode.text(); 231 | Elements infoNode = doc.getElementsByTag("span").not("[style]"); 232 | for (Element node : infoNode) { 233 | if (node.text().startsWith("作者:")) { 234 | this.author = node.text().substring(3, node.text().length()); 235 | } 236 | if (node.text().startsWith("出版日期:")) { 237 | this.publishDate = node.text().substring(5, node.text().length()); 238 | } 239 | } 240 | } catch (BookDLException e) { 241 | e.printStackTrace(); 242 | } 243 | } 244 | 245 | /** 246 | * 获取书本的在线阅读地址。 247 | * 248 | * @return 书本在线与阅读的URL 249 | * @throws IOException IO错误 250 | */ 251 | public String getbookread() throws IOException { 252 | resetCookie(); 253 | String para = "BID=" + id + "&ReadMode=0&pdfread=0&displaystyle=0"; 254 | String Url = NJULib.baseUrl + "/getbookread?" + para; 255 | String result = MyHttpRequest.getWithCookie(Url, null, cookie, "UTF-8", 1000); 256 | return NJULib.baseUrl + URLDecoder.decode(result, "UTF-8"); 257 | } 258 | 259 | /** 260 | * 重置{@link #cookie} 261 | * 262 | * @throws IOException 重置cookie失败 263 | */ 264 | private void resetCookie() throws IOException { 265 | cookie = (cookie == null) ? 
NJULib.getSession() : cookie; 266 | } 267 | 268 | @Override 269 | public String toString() { 270 | return "Book{" + 271 | "id='" + id + '\'' + 272 | ", name='" + name + '\'' + 273 | ", author='" + author + '\'' + 274 | ", publishDate='" + publishDate + '\'' + 275 | ", theme='" + theme + '\'' + 276 | ", bookClass='" + bookClass.getPath() + '\'' + 277 | ", detailBookClass='" + detailBookClass + '\'' + 278 | '}'; 279 | } 280 | 281 | /** 282 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 283 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 284 | * 错误日志将在当前路径下名为"error.log" 285 | */ 286 | public void download() { 287 | BookDownloader bookDownloader = new BookDownloader(this); 288 | bookDownloader.downloadAllImages(); 289 | } 290 | 291 | /** 292 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 293 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 294 | * 错误日志将在当前路径下名为"error.log" 295 | * 296 | * @param pathname 下载存储目录 297 | * @param threadNumber 下载线程数 298 | */ 299 | public void download(String pathname, int threadNumber) { 300 | BookDownloader bookDownloader = new BookDownloader(this); 301 | bookDownloader.setSavePath(pathname); 302 | bookDownloader.setThreadNumber(threadNumber); 303 | bookDownloader.downloadAllImages(); 304 | } 305 | 306 | /** 307 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 308 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 309 | * 310 | * @param pathname 下载存储目录 311 | * @param threadNumber 线程数 312 | * @param errorLogPath 错误日志路径 313 | */ 314 | public void download(String pathname, int threadNumber, String errorLogPath) { 315 | BookDownloader bookDownloader = new BookDownloader(this); 316 | bookDownloader.setSavePath(pathname); 317 | bookDownloader.setThreadNumber(threadNumber); 318 | bookDownloader.setErrorLogPath(errorLogPath); 319 | bookDownloader.downloadAllImages(); 320 | } 321 | 322 | @Override 323 | public int hashCode() { 324 | return Integer.parseInt(this.getId()); 325 | } 326 | 327 | @Override 328 | public boolean equals(Object obj) { 329 | if (!(obj instanceof Book)) 330 | return false; 331 
| if (obj == this) 332 | return true; 333 | return this.id.equals(((Book) obj).id); 334 | } 335 | } 336 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/BookClass.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | import com.njulib.spider.BookDownloader; 8 | import com.njulib.spider.NJULib; 9 | import utils.conversion.MyDecoder; 10 | import utils.network.MyHttpRequest; 11 | 12 | import java.io.IOException; 13 | import java.nio.file.Paths; 14 | import java.util.*; 15 | import java.util.concurrent.atomic.AtomicInteger; 16 | import java.util.regex.Matcher; 17 | import java.util.regex.Pattern; 18 | import java.util.stream.Collectors; 19 | 20 | /** 21 | * 图书的分类。 22 | *

23 | * 对应南京大学馆藏数字化图书平台 中的图书。 24 | * 同时分类名和分类编号满足中图法分类。是树结构。具有查询子分类和查询分类下书籍列表,批量下载分类书籍等功能。 25 | * 如果你没有足够信息构造实例,可以通过{@link RootBookClass}查询所有分类来获取实例。 26 | * 或者{@link com.njulib.spider.BookSearch}中的一些方法获取实例。 27 | * 28 | * @author padeoe 29 | * @Date: 2016/12/08 30 | */ 31 | public class BookClass { 32 | /** 33 | * 分类id,服务器定义的中图法分类id, 34 | * 例如"0T0P3010" 35 | */ 36 | private String id; 37 | /** 38 | * 分类名称 39 | */ 40 | private String name; 41 | /** 42 | * 父分类 43 | */ 44 | private BookClass parent; 45 | /** 46 | * 子分类列表 47 | */ 48 | private Map children; 49 | 50 | 51 | /** 52 | * 子分类{@link #children}是否已经被加载 53 | */ 54 | private boolean isLoaded = false; 55 | 56 | /** 57 | * 查看当对象所使用的cookie 58 | * 59 | * @return cookie 60 | */ 61 | public String getCookie() { 62 | return cookie; 63 | } 64 | 65 | /** 66 | * 设置{@code cookie},BookClass的子每一次子分类加载, 67 | * 书籍查询等操作都需要cookie,设置的cookie将会对所有子分类使用, 68 | * 以避免频繁获取cookie 69 | * 70 | * @param cookie cookie 71 | */ 72 | public void setCookie(String cookie) { 73 | this.cookie = cookie; 74 | } 75 | 76 | /** 77 | * 查询分类信息时向服务器发送的cookie,初始=null。 78 | * 当调用了需要网络的方法时,将会被初始化。 79 | * 一个{@link BookClass}对象的所有子分类{@link #children}都是用的同一个cookie 80 | */ 81 | private String cookie; 82 | 83 | /** 84 | * 获取子分类的数量 85 | * 86 | * @return 子分类的数量 87 | */ 88 | public int getChildCount() { 89 | return children.size(); 90 | } 91 | 92 | /** 93 | * 获取父分类 94 | * 95 | * @return 父分类。如果不存在则为null 96 | */ 97 | public BookClass getParent() { 98 | return parent; 99 | } 100 | 101 | 102 | /** 103 | * 获取所有子分类。 104 | * 初始为null,若要查看子分类,必须先调用{@link #loadChild()}或者{@link #loadAllChild()}从服务器查询并加载 105 | * 106 | * @return 子分类的集合 107 | */ 108 | public Set getChildren() { 109 | return children.values().stream().collect(Collectors.toSet()); 110 | } 111 | 112 | /** 113 | * 查询特定子分类。 114 | * 115 | * @param idOrName 子分类的名称或者代号。符合中图法分类。 116 | * @return 子分类 117 | */ 118 | public BookClass getChild(String idOrName) { 119 | return children.get(idOrName); 120 | } 121 | 122 | public String 
getName() { 123 | return name; 124 | } 125 | 126 | public void setName(String name) { 127 | this.name = name; 128 | } 129 | 130 | public String getId() { 131 | return id; 132 | } 133 | 134 | public void setId(String id) { 135 | this.id = id; 136 | } 137 | 138 | public void setParent(BookClass parent) { 139 | this.parent = parent; 140 | } 141 | 142 | public boolean isTerminal() { 143 | return false; 144 | } 145 | 146 | /** 147 | * 添加一个子分类 148 | * 149 | * @param bookClass 子分类 150 | * @return 如果同id的子分类已存在,则返回之前的子分类,如果不存在,则添加并返回null 151 | */ 152 | public BookClass addChild(BookClass bookClass) { 153 | if (bookClass.name != null) { 154 | children.putIfAbsent(bookClass.name, bookClass); 155 | } 156 | return children.putIfAbsent(bookClass.id, bookClass); 157 | } 158 | 159 | /** 160 | * 创建并初始化一个书本分类。指定分类编号,分类名称和父分类。 161 | * 162 | * @param id 分类编号 163 | * @param name 分类名称 164 | * @param parent 父分类 165 | */ 166 | public BookClass(String id, String name, BookClass parent) { 167 | this.id = id; 168 | this.name = name; 169 | this.parent = parent; 170 | children = new HashMap<>(); 171 | } 172 | 173 | /** 174 | * 创建一个新初始化的{@code BookClass}对象, 175 | * 使之中图法分类标识是{@code id} 176 | * 177 | * @param id 分类的中图法分类标识。 178 | * 需要和南京大学馆藏数字化图书平台定义的格式一致 179 | */ 180 | public BookClass(String id) { 181 | this.id = id; 182 | children = new HashMap<>(); 183 | this.isLoaded = false; 184 | } 185 | 186 | /** 187 | * 加载子分类。仅加载一层子分类,即子分类的子分类不会被加载。 188 | * 当该方法被调用时,会向服务器查询该分类的子分类并更新该对象的{@link #children} 189 | *

190 | * 如需递归加载子分类,调用{@link #loadAllChild()} 191 | * 192 | * @throws IOException 从服务器查询子节点出错 193 | */ 194 | public void loadChild() throws IOException { 195 | if (!isTerminal()) { 196 | checkCookie(); 197 | String Url = NJULib.baseUrl + "/classifyview"; 198 | String data = "fenlei=" + this.getId() + "&lib=markbook"; 199 | String result = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "UTF-8", 1000); 200 | // System.out.println(result); 201 | Document doc = Jsoup.parse(result); 202 | Elements li = doc.getElementsByTag("li"); 203 | for (Element bookClassId : li) { 204 | String id = bookClassId.attr("id"); 205 | String name = bookClassId.getElementsByTag("a").text(); 206 | boolean hasSubTree = bookClassId.getElementsByTag("img").attr("onClick").contains("getSubTree"); 207 | //System.out.println(id+" "+NJULib.decodeUrlUnicode(name)); 208 | BookClass child = hasSubTree ? new BookClass(id, MyDecoder.decodeUrlUnicode(name), this) : 209 | new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(name), this); 210 | child.setCookie(cookie); 211 | this.addChild(child); 212 | } 213 | this.isLoaded = true; 214 | } 215 | } 216 | 217 | 218 | /** 219 | * 迭代加载所有子分类。 220 | * 直至加载到每个分类的末层分类。 221 | * 222 | * @throws IOException 从服务器查询时出错 223 | */ 224 | public void loadAllChild() throws IOException { 225 | if (!isTerminal()) { 226 | loadChild(); 227 | for (BookClass child : getChildren()) { 228 | child.loadAllChild(); 229 | } 230 | } 231 | } 232 | 233 | 234 | /** 235 | * 下载分类下所有图书,会迭代测创建分类文件夹 236 | * 237 | * @param pathname 存储路径。将在该路径下创建多级分类目录并保存下载的图书 238 | * @param threadNumber 线程数 239 | * @param errorLogPath 错误日志路径 240 | * @throws IOException 连接失败的错误 241 | */ 242 | public void downloadWithCataDir(String pathname, int threadNumber, String errorLogPath) throws IOException { 243 | if (!isTerminal()) { 244 | loadChild(); 245 | for (BookClass child : getChildren()) { 246 | child.downloadWithCataDir(Paths.get(pathname, name == null ? 
id : name).toString(), threadNumber, errorLogPath); 247 | } 248 | } else { 249 | downloadAllBooks(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath); 250 | } 251 | } 252 | 253 | /** 254 | * 下载分类下所有图书,会迭代测创建分类文件夹 255 | * 下载存储路径为当前路径,线程数为5,错误日志将保存在当前路径,文件名为{@link BookDownloader#ERROR_LOG_NAME} 256 | * 可以调用重载{@link #downloadWithCataDir(String, int, String)}设置参数 257 | * 258 | * @throws IOException 连接失败的错误 259 | */ 260 | public void downloadWithCataDir() throws IOException { 261 | downloadWithCataDir(System.getProperty("user.dir"), 5, Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString()); 262 | } 263 | 264 | /** 265 | * 从服务器获取该分类下图书列表的第{@code page}页。 266 | * 图书列表的分页是服务器做的,每页最多10条图书。 267 | *

268 | * 页数的最大值可以根据{@link #queryBooksSize()}自行计算 269 | * 270 | * @param page 图书列表的页码 271 | * @return 列表该页记录的图书 272 | * @throws IOException 从服务器查询书本列表时出错 273 | */ 274 | public Set queryBooks(int page) throws IOException { 275 | checkCookie(); 276 | String data = "fenlei=" + this.id + "&mark=all&Page=" + page + "&totalnumber=-1"; 277 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 278 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 279 | // System.out.println(html); 280 | Set books = queryBooks(html); 281 | return books; 282 | 283 | } 284 | 285 | /** 286 | * 获得某分类下的所有图书 287 | * 288 | * @return 分类下所有图书 289 | * @throws IOException 从服务器查询书本列表时出错 290 | */ 291 | public Set queryAllBooks() throws IOException { 292 | return queryAllBooks(5); 293 | } 294 | 295 | /** 296 | * 获得分类下的所有图书 297 | * 298 | * @param threadNumber 线程数 299 | * @return 图书集合 300 | * @throws IOException 连接错误 301 | */ 302 | public Set queryAllBooks(int threadNumber) throws IOException { 303 | checkCookie(); 304 | String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1"; 305 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 306 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 307 | // System.out.println(html); 308 | Document doc = Jsoup.parse(html); 309 | Elements form = doc.select("a:contains(末页)"); 310 | 311 | if (!form.isEmpty()) { 312 | String keyword = form.get(0).attr("href"); 313 | String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1); 314 | int size = Integer.parseInt(booksize); 315 | System.out.println("一共 " + size + " 本书"); 316 | Set books = queryBooks(html); 317 | List threadList = new ArrayList<>(); 318 | 319 | AtomicInteger needGettedPage = new AtomicInteger(2);//需要获取的页码 320 | int lastPage = size / 10 + 1;//最后一页的页码 321 | //开始多线程刷所有页码 322 | for (int threadN = 0; threadN < threadNumber; threadN++) { 323 | threadList.add(new 
PageGetThread(needGettedPage, lastPage)); 324 | } 325 | 326 | for (PageGetThread thread : threadList) { 327 | thread.start(); 328 | } 329 | for (PageGetThread thread : threadList) { 330 | try { 331 | thread.join(); 332 | } catch (InterruptedException e) { 333 | e.printStackTrace(); 334 | } 335 | } 336 | threadList.forEach(pageGetThread -> books.addAll(pageGetThread.getThreadBooks())); 337 | return books; 338 | } 339 | return null; 340 | } 341 | 342 | /** 343 | * 下载分类下所有图书。 344 | * 所有书籍将直接保存在{@code pathname}目录下,每本书一个文件夹,以书名命名。如同名,则加作者名,如又同名,加书本编号 345 | * 346 | * @param pathname 存储路径。书本文件夹所在的上级路径 347 | * @param threadNumber 线程数 348 | * @param errorLogPath 错误日志路径 349 | * @throws IOException 连接失败的错误 350 | */ 351 | public void downloadAllBooks(String pathname, int threadNumber, String errorLogPath) throws IOException { 352 | checkCookie(); 353 | String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1"; 354 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 355 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 356 | // System.out.println(html); 357 | Document doc = Jsoup.parse(html); 358 | Elements form = doc.select("a:contains(末页)"); 359 | if (!form.isEmpty()) { 360 | String keyword = form.get(0).attr("href"); 361 | String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1); 362 | int size = Integer.parseInt(booksize); 363 | System.out.println(this.getPath()+"一共 " + size + " 本书"); 364 | Set books = queryBooks(html); 365 | Set downloading; 366 | downloadBooks(books, pathname, threadNumber, errorLogPath); 367 | int lastPage = size / 10 + 1;//最后一页的页码 368 | int index = 1; 369 | for (int i = lastPage; i >= 2; i--) { 370 | downloading = queryBooks(i); 371 | for (Book book : downloading) { 372 | if (books.add(book)) { 373 | book.download(pathname, threadNumber, errorLogPath); 374 | index++; 375 | } else { 376 | System.out.println("服务器返回了重复书籍,跳过 " + book); 377 | } 378 | } 379 | } 
380 | System.out.println("去重后共" + books.size() + "书,实际下载了" + (index + 10) + "本书(含失败)"); 381 | } 382 | } 383 | 384 | private void downloadBooks(Set books, String pathname, int threadNumber, String errorLogPath) { 385 | for (Book book : books) { 386 | book.download(pathname, threadNumber, errorLogPath); 387 | } 388 | } 389 | 390 | 391 | /** 392 | * 获取所有图书列表的线程 393 | */ 394 | class PageGetThread extends Thread { 395 | Set books = new HashSet<>(); 396 | AtomicInteger needGettedPage; 397 | int lastPage; 398 | 399 | public PageGetThread(AtomicInteger needGettedPage, int lastPage) { 400 | this.needGettedPage = needGettedPage; 401 | this.lastPage = lastPage; 402 | } 403 | 404 | @Override 405 | public void run() { 406 | while (true) { 407 | int gettingpage = needGettedPage.getAndIncrement(); 408 | if (gettingpage <= lastPage) { 409 | try { 410 | if (gettingpage % 10 == 0) { 411 | resetCookie(); 412 | } 413 | books.addAll(queryBooks(gettingpage)); 414 | } catch (IOException e) { 415 | e.printStackTrace(); 416 | } 417 | } else { 418 | break; 419 | } 420 | } 421 | } 422 | 423 | public Set getThreadBooks() { 424 | return books; 425 | } 426 | } 427 | 428 | 429 | /** 430 | * 获取HTML文本中的书籍并根据其分类添加进当前的分类结构 431 | * 432 | * @param html 服务器特定页面返回的包含书本信息的HTML文本。 433 | * 服务器多个不同页面返回的包含书本信息的HTML中书本信息相关节点的格式都相似。均可调用本函数 434 | * @return HTML中记录的书本 435 | */ 436 | public Set queryBooks(String html) { 437 | Document doc = Jsoup.parse(html); 438 | Elements booksliNode = doc.select("li[style]"); 439 | return queryBooks(booksliNode); 440 | } 441 | 442 | private Set queryBooks(Elements booksliNode) { 443 | Set books = new HashSet<>(); 444 | for (Element element : booksliNode) { 445 | //获取书名和id 446 | String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null; 447 | BookClass bookBookClass; 448 | Elements nameIdNode = element.select("p[class=name]"); 449 | if (nameIdNode != null) { 450 | name = nameIdNode.text(); 451 | Elements idNode = 
nameIdNode.select("a[onclick]"); 452 | if (idNode != null && idNode.size() > 0) { 453 | String idOnClick = idNode.get(0).attr("onclick"); 454 | int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(","); 455 | if (start != 0 && end != -1) { 456 | id = idOnClick.substring(start, end); 457 | } 458 | } 459 | } 460 | //获取分类 461 | BookClass[] bookClasses = new BookClass[0]; 462 | Elements infoNode = element.select("p[class=info]"); 463 | if (infoNode != null) { 464 | Elements bookInfos = infoNode.select("a"); 465 | if (bookInfos != null && bookInfos.size() > 0) { 466 | Element terminalCataNode = bookInfos.last(); 467 | bookInfos.remove(terminalCataNode); 468 | List tmplist = bookInfos.stream() 469 | .map(bookInfo -> getBookCata(bookInfo, false)) 470 | .filter(Objects::nonNull) 471 | .collect(Collectors.toList()); 472 | BookClass terminalBookClass = getBookCata(terminalCataNode, true); 473 | if (terminalBookClass != null) { 474 | tmplist.add(terminalBookClass); 475 | } 476 | bookClasses = tmplist.toArray(bookClasses); 477 | } 478 | } 479 | bookBookClass = this.link(bookClasses); 480 | 481 | //获取作者,出版日期,主题词,分类 482 | String info = element.text(); 483 | Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 
分类[::](.*)"); 484 | Matcher matcher = pattern.matcher(info); 485 | while (matcher.find()) { 486 | name = matcher.group(1); 487 | author = matcher.group(2); 488 | publishDate = matcher.group(3); 489 | theme = matcher.group(4); 490 | detailBookClass = matcher.group(5); 491 | } 492 | Pattern minPattern = Pattern.compile(".*(《.*》).*"); 493 | Matcher minMatcher = minPattern.matcher(info); 494 | while (minMatcher.find()) { 495 | name = minMatcher.group(1); 496 | } 497 | 498 | //汇总书本 499 | if (name != null && id != null) { 500 | Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass); 501 | book.setCookie(cookie); 502 | books.add(book); 503 | if (bookBookClass.isTerminal()) { 504 | ((TerminalBookClass) bookBookClass).addBook(book); 505 | } else { 506 | System.out.println("未获取到分类信息,将不被归档 " + book); 507 | } 508 | } else { 509 | System.out.println("error: " + info); 510 | } 511 | } 512 | return books; 513 | } 514 | 515 | 516 | /** 517 | * 通过HTML中对应节点获取到书所在分类 518 | * 519 | * @param bookInfo 书本信息的HTML节点 520 | * @param isTerminal 是否是终端分类 521 | * @return 书所在分类。如果是终端分类将会返回{@code TerminalBookClass} 522 | */ 523 | private BookClass getBookCata(Element bookInfo, boolean isTerminal) { 524 | String cataName = bookInfo.text(); 525 | String href = bookInfo.attr("href"); 526 | if (href != null) { 527 | int cataIdStart = href.indexOf('=') + 1; 528 | if (cataIdStart != 0) { 529 | String cataId = href.substring(href.indexOf('=') + 1, href.length()); 530 | BookClass tmp = isTerminal ? 
new TerminalBookClass(cataId) : new BookClass(cataId); 531 | tmp.setName(cataName); 532 | return tmp; 533 | } 534 | 535 | } 536 | return null; 537 | } 538 | 539 | 540 | /** 541 | * 从服务器查询当前分类下图书的数量。包含所有子分类下的图书 542 | * 543 | * @return 当前分类下图书的数量 544 | * @throws IOException 查询失败 545 | */ 546 | public int queryBooksSize() throws IOException { 547 | checkCookie(); 548 | String data = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1"; 549 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 550 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 551 | // System.out.println(html); 552 | Document doc = Jsoup.parse(html); 553 | Elements form = doc.select("input[name=totalnumber]"); 554 | if (!form.isEmpty()) { 555 | String booksize = form.get(0).attr("value"); 556 | return Integer.parseInt(booksize); 557 | } 558 | return 0; 559 | } 560 | 561 | 562 | /** 563 | * 检查{@code cookie}如果为null将会更新cookie 564 | * 565 | * @throws IOException 更新cookie失败 566 | */ 567 | private void checkCookie() throws IOException { 568 | cookie = (cookie == null) ? NJULib.getSession() : cookie; 569 | } 570 | 571 | /** 572 | * 重置{@code cookie} 573 | * 574 | * @throws IOException 重置cookie失败 575 | */ 576 | private void resetCookie() throws IOException { 577 | cookie = NJULib.getSession(); 578 | } 579 | 580 | /** 581 | * 对当前分类添加子分类 582 | * 583 | * @param childBookClasses 顺次路径关系子分类,后一个是前一个的子分类。第一个是当前分类的子分类 584 | * @return 子分类的最后一级分类.若子路径参数为空,则为当前分类 585 | */ 586 | public BookClass link(BookClass... childBookClasses) { 587 | BookClass currentBookClass = this; 588 | for (BookClass bookClass : childBookClasses) { 589 | BookClass previois = currentBookClass.addChild(bookClass); 590 | if (previois != null) { 591 | currentBookClass = previois; 592 | } else { 593 | bookClass.parent = currentBookClass; 594 | currentBookClass = bookClass; 595 | } 596 | } 597 | return currentBookClass; 598 | } 599 | 600 | /** 601 | * 获取分类对象所有终端分类下已存储的书籍 602 | *

603 | * 不会触发网络请求,只是迭代收集子分类的下已存在的书籍。 604 | * 如要即时从服务器查询书籍,请调用{@link #queryAllBooks()}及其重载 605 | * 606 | * @return 该分类下属所有分类的图书集合 607 | */ 608 | public Set getBooks() { 609 | return this.getChildren().stream().map(BookClass::getBooks).collect(HashSet::new, Set::addAll, Set::addAll); 610 | } 611 | 612 | 613 | /** 614 | * 判断两个{@code BookClass}是否是同一个分类。 615 | * 仅根据代号即{@link BookClass#id}来判断 616 | * 617 | * @param obj 任意对象 618 | * @return 对象是否是同一个分类 619 | */ 620 | @Override 621 | public boolean equals(Object obj) { 622 | if (!(obj instanceof BookClass)) 623 | return false; 624 | if (obj == this) 625 | return true; 626 | return this.id.equals(((BookClass) obj).id); 627 | } 628 | 629 | /** 630 | * 获取分类所在的路径。 631 | * 返回可读的{@code String},对二级分类到当前分类顺次所经路径分别调用{@link BookClass#toString()},用"-"分割 632 | * 633 | * @return 从二级分类到当前分类顺次所经路径,用"-"分隔分类 634 | */ 635 | public String getPath() { 636 | Stack parents = new Stack<>(); 637 | BookClass bookClass = this; 638 | while (bookClass!=null&&!bookClass.isRoot()) { 639 | parents.push(bookClass); 640 | bookClass = bookClass.getParent(); 641 | } 642 | StringBuilder sb = new StringBuilder(); 643 | if (!parents.isEmpty()) { 644 | sb.append(parents.pop().toString()); 645 | } 646 | while (!parents.isEmpty()) { 647 | sb.append("-"); 648 | sb.append(parents.pop().toString()); 649 | } 650 | return sb.toString(); 651 | } 652 | 653 | /** 654 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 655 | * 656 | * @return 是否是根分类 657 | */ 658 | public boolean isRoot() { 659 | return false; 660 | } 661 | 662 | /** 663 | * 返回{@code BookClass}的哈希值。 664 | * 会直接使用用{@link #id}的哈希值 665 | * 666 | * @return 哈希值 667 | */ 668 | @Override 669 | public int hashCode() { 670 | return id.hashCode(); 671 | } 672 | 673 | /** 674 | * 返回{@code BookClass}的可读字符串描述。 675 | * 676 | * @return 格式是 "分类代号(分类名)",如果分类名为null,则格式是"分类代号" 677 | */ 678 | @Override 679 | public String toString() { 680 | return this.getId() + (this.getName() == null ? 
"" : "(" + this.getName() + ")"); 681 | } 682 | } 683 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/Books.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * 书本查询的结果。{@link com.njulib.spider.BookSearch}类某些方法的返回值用到本类 7 | * 包含了查询出的图书当前页集合,以及查询结果的总页数,书本总数。 8 | * 9 | * @author padeoe 10 | * @Date: 2016/12/09 11 | */ 12 | public class Books { 13 | private int page; 14 | private int totalNums; 15 | private int totalPage; 16 | private Set bookSet; 17 | 18 | /** 19 | * @param page 当前页数 20 | * @param totalPage 总页数 21 | * @param totalNums 总书本数 22 | * @param bookSet 本页的书 23 | */ 24 | public Books(int page, int totalPage, int totalNums, Set bookSet) { 25 | this.totalPage = totalPage; 26 | this.bookSet = bookSet; 27 | } 28 | 29 | /** 30 | * 获取查询到的图书总数 31 | * 32 | * @return 查询到的图书总数 33 | */ 34 | public int getTotalNums() { 35 | return totalNums; 36 | } 37 | 38 | public int getPage() { 39 | return page; 40 | } 41 | 42 | public int getTotalPage() { 43 | return totalPage; 44 | } 45 | 46 | public Set getBookSet() { 47 | return bookSet; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/InfoReader.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | import com.njulib.spider.BookDownloader; 4 | 5 | import java.io.IOException; 6 | import java.nio.charset.Charset; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.List; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | /** 14 | * info文件解析器。 15 | *

16 | * info文件是由{@link BookDownloader}在下载过程中创建的文本文件。 17 | * 记录了一个{@link Book#toString()} 18 | * 默认名称是{@link BookDownloader#INFO_FILE_NAME}。 19 | * 该类会读取info文件并解析出{@link Book}对象 20 | * 21 | * @author padeoe 22 | * @Date: 2016/12/11 23 | */ 24 | public class InfoReader { 25 | private String infoFilePath; 26 | 27 | public InfoReader(String infoFilePath) { 28 | this.infoFilePath = infoFilePath; 29 | } 30 | 31 | /** 32 | * 解析{@code Book}对象,如果未找到返回null 33 | * 34 | * @return {@code Book}对象 35 | */ 36 | public Book read() { 37 | try { 38 | List lines = Files.readAllLines(Paths.get(infoFilePath), Charset.forName("UTF-8") ); 39 | String info = ""; 40 | if (lines.size() > 0) { 41 | info = lines.get(0); 42 | } 43 | Pattern pattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass=(.*), detailBookClass='(.*)'\\}"); 44 | Matcher matcher = pattern.matcher(info); 45 | if (matcher.find()) { 46 | return new Book(matcher.group(1), 47 | matcher.group(2), 48 | matcher.group(3), 49 | matcher.group(4), 50 | matcher.group(5), 51 | new BookClass(matcher.group(6)), 52 | matcher.group(7)); 53 | } 54 | return null; 55 | } catch (IOException e) { 56 | e.printStackTrace(); 57 | return null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/RootBookClass.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | /** 4 | * 根分类 5 | *

6 | * 根分类是在中图法分类之外虚拟出的分类。 7 | * 用于集合管理所有子分类,以及作为起点,从服务器获取子分类。 8 | * 9 | * @author padeoe 10 | * @Date: 2016/12/20 11 | */ 12 | public class RootBookClass extends BookClass { 13 | public RootBookClass() { 14 | super("all"); 15 | } 16 | 17 | /** 18 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 19 | * 20 | * @return true 21 | */ 22 | @Override 23 | public boolean isRoot() { 24 | return true; 25 | } 26 | 27 | /** 28 | * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例 29 | * 30 | * @return false 31 | */ 32 | @Override 33 | public boolean isTerminal() { 34 | return false; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/TerminalBookClass.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | /** 7 | * 终端分类。即分类的最末层。 8 | *

9 | * 采用的是中图法分类,例如"哲学宗教-哲学理论-辩证唯物主义-总论"的最后一个"总论"就是一个终端分类。 10 | * 只有终端分类下可以存储图书。 11 | * 12 | * @author padeoe 13 | * @Date: 2016/12/20 14 | */ 15 | public class TerminalBookClass extends BookClass { 16 | private Set books = new HashSet<>(); 17 | 18 | /** 19 | * 创建一个新初始化的{@code BookClass}对象, 20 | * 使之中图法分类标识是{@code id} 21 | * 22 | * @param id 分类的中图法分类标识。 23 | * 需要和南京大学馆藏数字化图书平台定义的格式一致 24 | */ 25 | public TerminalBookClass(String id) { 26 | super(id); 27 | } 28 | 29 | 30 | /** 31 | * 构造函数。 32 | * 33 | * @param id 分类编号 34 | * @param name 分类名 35 | * @param parent 父分类 36 | */ 37 | public TerminalBookClass(String id, String name, BookClass parent) { 38 | super(id, name, parent); 39 | } 40 | 41 | /** 42 | * 获取分类下的书籍 43 | * 该方法只是返回该分类下现有书籍,不会向服务器查询该分类下所有图书。 44 | * 如需向服务器查询,请调用{@link BookClass#queryAllBooks()}及其重载方法 45 | * 46 | * @return 分类下的书籍。 47 | */ 48 | public Set getBooks() { 49 | return books; 50 | } 51 | 52 | /** 53 | * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例 54 | * 55 | * @return true 56 | */ 57 | @Override 58 | public boolean isTerminal() { 59 | return true; 60 | } 61 | 62 | 63 | /** 64 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 65 | * 66 | * @return false 67 | */ 68 | @Override 69 | public boolean isRoot() { 70 | return false; 71 | } 72 | 73 | /** 74 | * 增加分类下图书 75 | * 76 | * @param book 图书 77 | * @return 如果分类下已有该图书,将返回false。如果没有,将添加并返回true 78 | */ 79 | public boolean addBook(Book book) { 80 | return books.add(book); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/exception/BookDLException.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object.exception; 2 | 3 | import com.njulib.object.Book; 4 | 5 | /** 6 | * 下载某一本书时发生错误。此异常发生在该书对应的文件夹创建之前。因此此书没有任何文件被下载。 7 | * 8 | * @author padeoe 9 | * Date: 2016/12/12 10 | */ 11 | public class BookDLException extends Exception { 12 | /** 13 | * 发生下载错误的书籍 14 
| */ 15 | private Book book; 16 | 17 | /** 18 | * 创意一个初始化的{@code BookDLException},并指定发生错误的书籍。 19 | * 20 | * @param book 发生下载错误的书籍 21 | */ 22 | public BookDLException(Book book) { 23 | this.book = book; 24 | } 25 | 26 | /** 27 | * 获取发生下载错误的书籍 28 | * 29 | * @return 发生下载错误的书籍 30 | */ 31 | public Book getBook() { 32 | return book; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/exception/BookPagesDLException.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object.exception; 2 | 3 | import java.util.Vector; 4 | 5 | /** 6 | * 下载某一本书时发生错误。 7 | *

8 | * 此异常发生在书本对应文件夹已经创建之后。 9 | * 包含了此书所有的书页下载错误{@code PageDLException},用于错误恢复 10 | * 11 | * @author padeoe 12 | * Date: 2016/12/10 13 | */ 14 | public class BookPagesDLException extends Exception { 15 | Vector pageDLExceptions; 16 | 17 | /** 18 | * 构造一个{@code BookPagesDLException},用此书所有的书页下载错误初始化 19 | * 20 | * @param pageDLExceptionList 此书所有的书页下载错误 21 | */ 22 | public BookPagesDLException(Vector pageDLExceptionList) { 23 | this.pageDLExceptions = pageDLExceptionList; 24 | } 25 | 26 | /** 27 | * 获取页错误的集合 28 | * 29 | * @return 此书所有的书页下载错误{@code PageDLException} 30 | */ 31 | public Vector getPageDLExceptions() { 32 | return pageDLExceptions; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/object/exception/PageDLException.java: -------------------------------------------------------------------------------- 1 | package com.njulib.object.exception; 2 | 3 | /** 4 | * 下载图书的某一页时失败。 5 | *

6 | * 该类包含了错误现场的信息,可用于错误恢复与后期处理 7 | * 8 | * @author padeoe 9 | * Date: 2016/12/10 10 | */ 11 | public class PageDLException extends Exception { 12 | private String url; 13 | private String location; 14 | 15 | /** 16 | * 创建并初始化一个{@code PageDLException}对象。指定下载地址和存储地址。 17 | * 18 | * @param url 出错页图片的网络地址 19 | * @param location 出错页图片本应存储的本地路径。不含图片后缀名 20 | */ 21 | public PageDLException(String url, String location) { 22 | super(); 23 | this.url = url; 24 | this.location = location; 25 | } 26 | 27 | /** 28 | * 获取出错页的URL 29 | * 30 | * @return 出错页的URL 31 | */ 32 | public String getUrl() { 33 | return url; 34 | } 35 | 36 | /** 37 | * 获取出错页图片本应存储的本地路径。 38 | * 39 | * @return 出错页图片本应存储的本地路径。不含图片后缀名 40 | */ 41 | public String getLocation() { 42 | return location; 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return "PageDLException{" + 48 | "url='" + url + '\'' + 49 | ", location='" + location + '\'' + 50 | '}'; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/spider/BookDownloader.java: -------------------------------------------------------------------------------- 1 | package com.njulib.spider; 2 | 3 | import com.njulib.fix.MissingPageCompletion; 4 | import com.njulib.object.Book; 5 | import com.njulib.object.InfoReader; 6 | import com.njulib.object.exception.BookDLException; 7 | import com.njulib.object.exception.BookPagesDLException; 8 | import com.njulib.object.exception.PageDLException; 9 | import org.jsoup.Jsoup; 10 | import org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import utils.network.MyHttpRequest; 13 | import utils.network.ReturnData; 14 | 15 | import java.io.*; 16 | import java.nio.charset.StandardCharsets; 17 | import java.nio.file.Path; 18 | import java.nio.file.Paths; 19 | import java.util.*; 20 | import java.util.concurrent.atomic.AtomicInteger; 21 | import java.util.regex.Matcher; 22 | import java.util.regex.Pattern; 23 | 24 | /** 25 | * 
书本的下载器,分离了下载相关的函数及变量。 26 | * 27 | * @author padeoe 28 | * Date: 2016/12/09 29 | */ 30 | public class BookDownloader { 31 | private String errorLogPath = ERROR_LOG_NAME; 32 | private int threadNumber = 5; 33 | 34 | /** 35 | * 获取下载器对应的{@code Book} 36 | * 37 | * @return 下载器对应的{@code Book} 38 | */ 39 | public Book getBook() { 40 | return book; 41 | } 42 | 43 | private Book book; 44 | private Map pageNumberMap; 45 | private String savePath = System.getProperty("user.dir"); 46 | private Path directory; 47 | private String urlPrefix; 48 | private PageType[] pageTypes = {PageType.COVER, PageType.BOOKNAME, PageType.LEGALINFO, PageType.INTRODUCTION, 49 | PageType.DIRECTORY, PageType.CONTENT, PageType.APPENDIX, PageType.BACKCOVER}; 50 | private AtomicInteger needDownload = new AtomicInteger(1); 51 | 52 | /** 53 | * 获取{@code Book}的页组成结构。 54 | * 55 | * @return 记录了每种{@link PageType}的数量。 56 | * @throws BookDLException 页组成获取失败,书本下载放弃 57 | */ 58 | public Map getPageNumberMap() throws BookDLException { 59 | if (pageNumberMap == null) { 60 | initialBookPara(); 61 | return pageNumberMap; 62 | } 63 | return pageNumberMap; 64 | } 65 | 66 | /** 67 | * 获取{@code Book}图片的URL前缀 68 | * 69 | * @return {@code Book}图片的URL前缀 70 | * @throws BookDLException 前缀获取失败,书本下载被放弃。 71 | */ 72 | public String getUrlPrefix() throws BookDLException { 73 | if (urlPrefix == null) { 74 | initialBookPara(); 75 | return urlPrefix; 76 | } 77 | return urlPrefix; 78 | } 79 | 80 | /** 81 | * 错误日志的默认文件名 82 | */ 83 | public static final String ERROR_LOG_NAME = "error.log"; 84 | /** 85 | * 书本信息记录的默认文件名 86 | */ 87 | public static final String INFO_FILE_NAME = "info.txt"; 88 | 89 | /** 90 | * 创建指定{@code book}的下载器 91 | * 92 | * @param book 指定的书本 93 | */ 94 | public BookDownloader(Book book) { 95 | this.book = book; 96 | } 97 | 98 | /** 99 | * 创建指定{@code Book}的下载器,将根据{@code bookid}创建{@link Book}对象 100 | * 101 | * @param bookid 书本id 102 | */ 103 | BookDownloader(String bookid) { 104 | this.book = new Book(bookid); 105 | } 106 
| 107 | /** 108 | * 查看下载线程数 109 | * 110 | * @return 当前指定的下载线程数。默认为5 111 | */ 112 | public int getThreadNumber() { 113 | return threadNumber; 114 | } 115 | 116 | /** 117 | * 设置下载线程数。书本与书本之间将会依次单线程下载。书本的所有页将会采用多线程下载。 118 | * 119 | * @param threadNumber 线程数 120 | */ 121 | public void setThreadNumber(int threadNumber) { 122 | this.threadNumber = threadNumber; 123 | } 124 | 125 | /** 126 | * 设置文件夹名 127 | * 128 | * @param directoryString 文件夹名 129 | */ 130 | public void setDirectory(String directoryString) { 131 | String directoryName = directoryString.replaceAll("[/\\\\:\"*?<>|]", " "); 132 | directory = Paths.get(savePath, directoryName); 133 | } 134 | 135 | /** 136 | * 设置保存路径 137 | * 138 | * @param savePath 下载保存路径 139 | */ 140 | public void setSavePath(String savePath) { 141 | this.savePath = savePath; 142 | } 143 | 144 | /** 145 | * 下载图片 146 | * 147 | * @param url 图片的url 148 | * @param pathname 保存的路径,包括文件名(不含图片后缀),例如"C:/Users/username/a",函数执行后会保存为"C:/Users/username/a.png" 149 | * @throws IOException 下载出错 150 | */ 151 | public static void downloadImage(String url, String pathname) throws IOException { 152 | ReturnData returnData = MyHttpRequest.action_returnbyte("GET", null, url, null, null, null, 2000); 153 | byte[] a = returnData.getData(); 154 | List types = returnData.getHeaders().get("Content-Type"); 155 | String suffix = ".png"; 156 | if (types != null && types.get(0) != null) { 157 | suffix = types.get(0).substring(types.get(0).indexOf('/') + 1, types.get(0).length()).toLowerCase(); 158 | suffix = suffix.equals("jpeg") ? ".jpg" : (suffix.equals("png") ? 
".png" : suffix); 159 | } 160 | File file = new File(pathname + suffix); 161 | BufferedOutputStream bf = new BufferedOutputStream(new FileOutputStream(file)); 162 | bf.write(a, 0, a.length); 163 | bf.close(); 164 | } 165 | 166 | 167 | /** 168 | * 初始化下载参数,从服务器查询书本下载所需的参数,包括书页url,书本页数,页类型 169 | * 执行后 170 | * 171 | * @throws BookDLException 查询参数出错,书本下载被终止 172 | */ 173 | private void initialBookPara() throws BookDLException { 174 | //获取页面地址 175 | String url; 176 | try { 177 | url = book.getbookread(); 178 | getBookPara(url); 179 | } catch (IOException e) { 180 | e.printStackTrace(); 181 | throw new BookDLException(book); 182 | } 183 | } 184 | 185 | public String getBookViewPageHtml(String url) throws BookDLException { 186 | if (url == null || url.length() == 0) { 187 | throw new BookDLException(book); 188 | } 189 | //获取书本参数,包括下载地址前缀,页数 190 | String html; 191 | try { 192 | html = MyHttpRequest.get(url, null, "UTF-8", 2000); 193 | } catch (IOException e) { 194 | e.printStackTrace(); 195 | throw new BookDLException(book); 196 | } 197 | return html; 198 | } 199 | 200 | private void getBookPara(String url) throws BookDLException { 201 | String html = getBookViewPageHtml(url); 202 | Document doc = Jsoup.parse(html); 203 | Element infoNode = doc.getElementsByTag("script").last(); 204 | pageNumberMap = new HashMap<>(); 205 | int epage = 0; 206 | if (infoNode.dataNodes().size() > 0) { 207 | String paraJs = infoNode.dataNodes().get(0).getWholeData(); 208 | Pattern pattern = Pattern.compile("var str='(.*)';.*epage = (\\d+);.*pages :\\[\\[1,(\\d+)\\],\\[1,(\\d+)\\],\\[1,(\\d+)\\]," + 209 | "\\[1,(\\d+)\\], \\[1,(\\d+)\\], \\[spage, epage\\], \\[1,(\\d+)\\], \\[1,(\\d+)\\]\\],.*", Pattern.DOTALL); 210 | Matcher matcher = pattern.matcher(paraJs); 211 | if (matcher.find()) { 212 | urlPrefix = matcher.group(1); 213 | pageNumberMap.put(pageTypes[5], Integer.parseInt(matcher.group(2))); 214 | pageNumberMap.put(pageTypes[0], Integer.parseInt(matcher.group(3))); 215 | 
pageNumberMap.put(pageTypes[1], Integer.parseInt(matcher.group(4))); 216 | pageNumberMap.put(pageTypes[2], Integer.parseInt(matcher.group(5))); 217 | pageNumberMap.put(pageTypes[3], Integer.parseInt(matcher.group(6))); 218 | pageNumberMap.put(pageTypes[4], Integer.parseInt(matcher.group(7))); 219 | pageNumberMap.put(pageTypes[6], Integer.parseInt(matcher.group(8))); 220 | pageNumberMap.put(pageTypes[7], Integer.parseInt(matcher.group(9))); 221 | } else { 222 | throw new BookDLException(book); 223 | } 224 | } else { 225 | System.out.println(book.getId() + " 参数获取失败"); 226 | throw new BookDLException(book); 227 | } 228 | } 229 | 230 | /** 231 | * 通过书页页数判断是否是同一本书,如果是则补全info文件,如果不是则不执行操作 232 | * 233 | * @throws BookDLException 从服务器查询书本参数时出错 234 | */ 235 | private void checkOldDirByPageSize() throws BookDLException { 236 | File[] oldfiles = directory.toFile().listFiles(); 237 | int oldBookSize = oldfiles == null ? 0 : oldfiles.length; 238 | //查询当前书本的页数 239 | initialBookPara(); 240 | int newBookSize = pageNumberMap.values().stream().mapToInt(number -> number).sum(); 241 | //若书页数相同,假定为同一本书,帮他补全info文件 242 | if (oldBookSize == newBookSize) { 243 | logBookInfo(); 244 | System.out.println("已存在,跳过并补全了info文件" + book.toString()); 245 | } 246 | } 247 | 248 | /** 249 | * 开始创建文件夹并下载,该函数调用前保存路径以及文件夹名必须已经设置完毕。该环节有多个出口: 250 | * 如果文件夹存在,将会调用{@link #handleOldDir()}进行下一步处理 251 | * 如果文件夹不存在,将会初始化参数并调用{@link #downloadFromParaSetDone()} 进行下一步处理 252 | * 253 | * @throws BookPagesDLException 书本下载过程中发生了缺页 254 | * @throws BookDLException 书本下载未开始 255 | */ 256 | private void downloadFromMkdir() throws BookPagesDLException, BookDLException { 257 | File path = directory.toFile(); 258 | //若目录存在,进入目录存在的处理例程 259 | if (path.exists()) { 260 | handleOldDir(); 261 | return; 262 | } 263 | //目录不存,准备下载。首先获取下载参数 264 | System.out.println("开始下载 " + book); 265 | //获取书本参数 266 | initialBookPara(); 267 | if (!path.mkdirs()) { 268 | System.out.println("文件夹创建失败"); 269 | throw new BookDLException(book); 270 | } 271 | 
downloadFromParaSetDone(); 272 | } 273 | 274 | /** 275 | * 书本参数已经从服务器获取完毕,直接进行下载并保存。 276 | * 277 | * @throws BookPagesDLException 书本的某些页下载失败 278 | */ 279 | private void downloadFromParaSetDone() throws BookPagesDLException { 280 | Vector pageDLExceptions = new Vector<>(); 281 | //首先顺序下载非正文内容 282 | for (int i = 0; i < pageTypes.length; i++) { 283 | if (i != 5) { 284 | try { 285 | download(pageTypes[i]); 286 | } catch (BookPagesDLException e) { 287 | pageDLExceptions.addAll(e.getPageDLExceptions()); 288 | } 289 | } 290 | } 291 | try { 292 | downloadContent(); 293 | //日志记录书本信息 294 | logBookInfo(); 295 | } catch (BookPagesDLException e) { 296 | pageDLExceptions.addAll(e.getPageDLExceptions()); 297 | } 298 | if (!pageDLExceptions.isEmpty()) { 299 | throw new BookPagesDLException(pageDLExceptions); 300 | } 301 | } 302 | 303 | /** 304 | * 下载文件夹已存在的处理函数。该函数会读取旧的文件夹下的info文件来判断待下载是不是同一本书。 305 | * 该步骤有多个出口: 306 | * 如果info文件不存在或效,将调用{@link #checkOldDirByPageSize()}做进一步判断 307 | * 如果info文件存在且有效,读取info中书本id比对是否是同一本书: 308 | * 如果是同一本书,将跳过;如果不是同一本书,将重新设置保存路径和文件夹名,并调用{@link #downloadFromMkdir()}进行下一步处理 309 | * 310 | * @throws BookPagesDLException 书本下载过程中发生了缺页 311 | * @throws BookDLException 书本下载未开始 312 | */ 313 | private void handleOldDir() throws BookPagesDLException, BookDLException { 314 | //开始检查是否真的是重复还是同名而已,根据书的id判断 315 | //读取info文件 316 | Path infoFilePath = directory.resolve(INFO_FILE_NAME); 317 | File infoFile = infoFilePath.toFile(); 318 | if (infoFile.exists()) { 319 | //info文件存在,读取info文件记录的书本id 320 | Book oldbook; 321 | oldbook = new InfoReader(infoFilePath.toString()).read(); 322 | //读出了旧的书本信息 323 | if (oldbook != null) { 324 | String oldBookId = oldbook.getId(); 325 | //两本书是同一本书 326 | if (oldBookId.equals(book.getId())) { 327 | System.out.println("已存在,跳过" + book.toString()); 328 | return; 329 | } 330 | //两本书是不同的书 331 | else { 332 | //如果两本书作者不同,文件夹添加作者名进行命名,并开始下载 333 | if (!book.getAuthor().equals(oldbook.getAuthor())) { 334 | setDirectory(book.getName() + "-" + 
book.getAuthor()); 335 | downloadFromMkdir(); 336 | } 337 | //如果两本书作者相同,用作者名加id命名 338 | else { 339 | setDirectory(book.getName() + "-" + book.getAuthor() + "-" + book.getId()); 340 | downloadFromMkdir(); 341 | } 342 | } 343 | } 344 | //info文件格式不正确,没有读出信息 345 | //假定就文件夹是一本旧的书目,文件夹添加作者名进行命名,并开始下载 346 | else { 347 | setDirectory(book.getName() + "-" + book.getAuthor()); 348 | downloadFromMkdir(); 349 | } 350 | } else { 351 | //info文件不存在,比对书本页数数量是否是同一本书决定下一步操作 352 | // checkOldDirByPageSize(); 353 | System.out.println("将删除没有info文件的目录"+directory.getFileName()); 354 | if(deleteDir(directory.toFile())){ 355 | downloadFromMkdir(); 356 | } 357 | else{ 358 | throw new BookDLException(this.book); 359 | } 360 | 361 | } 362 | } 363 | 364 | /** 365 | * 递归删除目录下的所有文件及子目录下所有文件 366 | * @param dir 将要删除的文件目录 367 | * @return boolean Returns "true" if all deletions were successful. 368 | * If a deletion fails, the method stops attempting to 369 | * delete and returns "false". 370 | */ 371 | public static boolean deleteDir(File dir) { 372 | if (dir.isDirectory()) { 373 | String[] children = dir.list(); 374 | //递归删除目录中的子目录下 375 | for (int i=0; i pageDLExceptions = new Vector<>(); 428 | ArrayList threadArrayList = new ArrayList<>(); 429 | for (int i = 0; i < threadNumber; i++) { 430 | threadArrayList.add(new Thread() { 431 | @Override 432 | public void run() { 433 | super.run(); 434 | while (true) { 435 | int downloading = needDownload.getAndIncrement(); 436 | if (downloading <= pageSize) { 437 | //System.out.println("假装在下载 "+downloading); 438 | try { 439 | download(PageType.CONTENT, downloading, String.format("%04d", firstPage + downloading - 1)); 440 | } catch (PageDLException e) { 441 | pageDLExceptions.add(e); 442 | } 443 | } else { 444 | break; 445 | } 446 | } 447 | } 448 | }); 449 | } 450 | for (Thread thread : threadArrayList) { 451 | thread.start(); 452 | } 453 | for (Thread thread : threadArrayList) { 454 | try { 455 | thread.join(); 456 | } catch (InterruptedException e) { 457 | 
e.printStackTrace(); 458 | } 459 | } 460 | if (!pageDLExceptions.isEmpty()) { 461 | throw new BookPagesDLException(pageDLExceptions); 462 | } 463 | } 464 | 465 | /** 466 | * 下载某一种页类型的所有页。 467 | * 468 | * @param pageType 页类型 469 | * @throws BookPagesDLException 某些页下载失败 470 | */ 471 | private void download(PageType pageType) throws BookPagesDLException { 472 | Vector pageDLExceptions = new Vector<>(); 473 | int base = getFirstPage(pageType); 474 | for (int i = 0; i < pageNumberMap.get(pageType); i++) { 475 | try { 476 | download(pageType, i + 1, String.format("%04d", base + i)); 477 | } catch (PageDLException e) { 478 | pageDLExceptions.add(e); 479 | } 480 | } 481 | if (!pageDLExceptions.isEmpty()) { 482 | throw new BookPagesDLException(pageDLExceptions); 483 | } 484 | } 485 | 486 | /** 487 | * 获取某一种类型页的第一页页码 488 | * 489 | * @param pageType 书页类型 490 | * @return 相对于整本书的页码 491 | */ 492 | private int getFirstPage(PageType pageType) { 493 | int base = 1;//该种类型页的第一页的页码 494 | for (PageType pageType1 : pageTypes) { 495 | if (pageType1.equals(pageType)) { 496 | break; 497 | } else { 498 | base += pageNumberMap.get(pageType1); 499 | } 500 | } 501 | return base; 502 | } 503 | 504 | /** 505 | * 下载某一种页类型的特定页 506 | * 507 | * @param pageType 页类型 508 | * @param page 图书的页码 509 | * @throws PageDLException 某些页下载失败 510 | */ 511 | private void download(PageType pageType, int page, String filename) throws PageDLException { 512 | int pageNumberLength = 6 - pageType.name.length(); 513 | StringBuilder url = new StringBuilder(); 514 | url.append(urlPrefix).append(pageType.name); 515 | for (int i = 0; i < pageNumberLength - String.valueOf(page).length(); i++) { 516 | url.append('0'); 517 | } 518 | url.append(page); 519 | url.append(".jpg"); 520 | String finalurl = url.toString(); 521 | String pathname = directory.resolve(filename).toString(); 522 | try { 523 | downloadImage(finalurl, pathname); 524 | } catch (IOException e) { 525 | try { 526 | downloadImage(finalurl, pathname); 527 | } catch 
(IOException e1) { 528 | throw new PageDLException(finalurl, pathname); 529 | } 530 | 531 | } 532 | } 533 | 534 | /** 535 | * 输出单页下载失败的日志,可以使用{@link MissingPageCompletion}来读取错误日志并恢复 536 | * 537 | * @param bookPagesDLException 单页失败异常 538 | * @param pageFailLogPath 日志路径 539 | */ 540 | private static void logPageFail(BookPagesDLException bookPagesDLException, String pageFailLogPath) { 541 | Vector<PageDLException> pageDLExceptions = bookPagesDLException.getPageDLExceptions(); 542 | for (PageDLException pageDLException : pageDLExceptions) { 543 | writeFile(pageFailLogPath, pageDLException.toString()); 544 | } 545 | 546 | } 547 | 548 | /** 549 | * 输出整本书下载失败的日志,用于后期恢复(暂未完成) 550 | * 551 | * @param bookFailLogPath 日志路径 552 | */ 553 | private void logBookFail(String bookFailLogPath) { 554 | writeFile(bookFailLogPath, book.toString()); 555 | } 556 | 557 | public static void writeFile(String filepath, String content) { 558 | Writer fstream = null; 559 | try { 560 | fstream = new OutputStreamWriter(new FileOutputStream(filepath, true), StandardCharsets.UTF_8); 561 | fstream.write(content); 562 | fstream.write(System.getProperty("line.separator")); 563 | fstream.close(); 564 | } 565 | catch (IOException e) { 566 | e.printStackTrace(); 567 | } 568 | } 569 | 570 | public void setErrorLogPath(String errorLogPath) { 571 | this.errorLogPath = errorLogPath; 572 | } 573 | 574 | /** 575 | * 书页的类型。每本书都由"封面","正文","目录"等若干种固定的页类型组成。 576 | */ 577 | public enum PageType { 578 | COVER("cov", 1), BOOKNAME("bok", 2), LEGALINFO("leg", 3), INTRODUCTION("fow", 4), DIRECTORY("!", 5), 579 | CONTENT("", 6), APPENDIX("att", 7), BACKCOVER("cov", 8); 580 | private String name; 581 | private int index; 582 | 583 | PageType(String name, int index) { 584 | this.name = name; 585 | this.index = index; 586 | } 587 | 588 | public String getName() { 589 | return name; 590 | } 591 | 592 | public int getIndex() { 593 | return index; 594 | } 595 | } 596 | } 597 | 
-------------------------------------------------------------------------------- /src/main/java/com/njulib/spider/BookSearch.java: -------------------------------------------------------------------------------- 1 | package com.njulib.spider; 2 | 3 | import com.njulib.object.Book; 4 | import com.njulib.object.Books; 5 | import com.njulib.object.RootBookClass; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.select.Elements; 9 | import utils.network.MyHttpRequest; 10 | 11 | import java.io.IOException; 12 | import java.net.URLEncoder; 13 | import java.util.HashMap; 14 | import java.util.Map; 15 | import java.util.Set; 16 | 17 | /** 18 | * 查询符合条件的书籍。 19 | *

20 | * 从南京大学馆藏数字化图书平台查询符合条件的书籍。 21 | * 可通过书名或者sql语句查询书籍。 22 | * 可以在查询过程中动态创建图书的分类目录结构。 23 | * 24 | * @author padeoe 25 | * @Date: 2016/12/09 26 | */ 27 | public class BookSearch { 28 | String cookie; 29 | 30 | /** 31 | * 查询 32 | * 33 | * @throws IOException 查询失败 34 | */ 35 | public BookSearch() throws IOException { 36 | this.cookie = NJULib.getSession(); 37 | } 38 | 39 | /** 40 | * 通过指定sql查询的where子句进行图书查询 41 | * 42 | * @param sqlWhereClause 一些已知字段包括"书名","主题词","出版日期","作者" 43 | * @param page 查询结果列表的页码 44 | * @param rootBookClass 查询到的书本将会添加进该分类结构 45 | * @return 查询结果,包含查询到的书本列表,书本总数量和结果总页数 46 | * @throws IOException 查询失败 47 | */ 48 | public Books searchBySQL(String sqlWhereClause, int page, RootBookClass rootBookClass) throws IOException { 49 | String url = NJULib.baseUrl + "/markbook/BookSearch.jsp"; 50 | String data = "Page=" + page + "&MethodType=1" + "&Library=&KeyName=0&Condition=" + URLEncoder.encode(sqlWhereClause, "UTF-8") + "&Sort=&links=0&PSize=10&_="; 51 | Map requestProperty = new HashMap<>(); 52 | requestProperty.put("Content-type", "application/x-www-form-urlencoded; charset=UTF-8"); 53 | String result = MyHttpRequest.postWithCookie(data, url, requestProperty, cookie, "UTF-8", "GBK", 2000); 54 | int totalNums = 0, totalPage = 0; 55 | Document doc = Jsoup.parse(result); 56 | Elements totalNumsNode = doc.select("input[name=TotalNums]"); 57 | if (totalNumsNode != null && totalNumsNode.size() > 0) { 58 | totalNums = Integer.parseInt(totalNumsNode.get(0).attr("value")); 59 | } 60 | Elements totalPageNode = doc.select("a[href]:contains(末页)"); 61 | if (totalPageNode != null && totalPageNode.size() > 0) { 62 | String href = totalPageNode.get(0).attr("href"); 63 | int start = href.indexOf('(') + 1; 64 | int end = href.indexOf(')'); 65 | if (start != 0 && end != -1) { 66 | totalPage = Integer.parseInt(href.substring(start, end)); 67 | } 68 | } 69 | Set books = rootBookClass.queryBooks(result); 70 | return new Books(page, totalPage, totalNums, books); 71 | } 72 | 73 
| /** 74 | * 通过指定sql查询的where子句进行图书查询 75 | * 76 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 77 | * @param page 查询结果列表的页码 78 | * @return 如果没有匹配结果,返回空的对象 79 | * @throws IOException 查询失败 80 | */ 81 | public Books searchBySQL(String sqlWhereClause, int page) throws IOException { 82 | return searchBySQL(sqlWhereClause, page, new RootBookClass()); 83 | } 84 | 85 | /** 86 | * 通过指定sql查询的where子句进行图书查询,只返回第一页结果。 87 | * 88 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 89 | * @return 如果没有匹配结果,返回空的对象 90 | * @throws IOException 查询失败 91 | */ 92 | public Books searchBySQL(String sqlWhereClause) throws IOException { 93 | return searchBySQL(sqlWhereClause, 1); 94 | } 95 | 96 | /** 97 | * 通过指定sql查询的where子句进行图书查询 98 | * 99 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 100 | * @return 查询结果,书的集合 101 | * @throws IOException 查询失败 102 | */ 103 | public Set findAllBySQL(String sqlWhereClause) throws IOException { 104 | Set bookSet = null; 105 | Books firstPageBooks = searchBySQL(sqlWhereClause, 1); 106 | bookSet = firstPageBooks.getBookSet(); 107 | for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) { 108 | bookSet.addAll(searchBySQL(sqlWhereClause, i).getBookSet()); 109 | } 110 | return bookSet; 111 | } 112 | 113 | /** 114 | * 通过指定sql查询的where子句进行图书查询,并把查询结果中的图书添加进分类结构 115 | * 116 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 117 | * @param rootBookClass 根分类 118 | * @return 查询结果,书本集合 119 | * @throws IOException 查询失败 120 | */ 121 | public Set findAllBySQL(String sqlWhereClause, RootBookClass rootBookClass) throws IOException { 122 | 123 | Books firstPageBooks = searchBySQL(sqlWhereClause, 1, rootBookClass); 124 | Set bookSet = firstPageBooks.getBookSet(); 125 | for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) { 126 | bookSet.addAll(searchBySQL(sqlWhereClause, i, rootBookClass).getBookSet()); 127 | } 128 | return bookSet; 129 | } 130 | 131 | private Books searchByName(String name) throws IOException { 
132 | return searchBySQL("书名 like '%" + name + "%' "); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/com/njulib/spider/NJULib.java: -------------------------------------------------------------------------------- 1 | package com.njulib.spider; 2 | 3 | import utils.network.MyHttpRequest; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * 用于获取Session 9 | * 10 | * @author padeoe 11 | * Date: 2016/12/08 12 | */ 13 | public class NJULib { 14 | public static final String baseUrl = "http://114.212.7.104:8181"; 15 | 16 | /** 17 | * 获取SessionId 18 | * 19 | * @return SessionId 20 | * @throws IOException 出现网络错误 21 | */ 22 | public static String getSession() throws IOException { 23 | System.out.println("正在重置cookie"); 24 | String Url = baseUrl + "/markbook/"; 25 | return MyHttpRequest.getAndGetCookie(Url, null, "UTF-8", 1000)[1]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/Start.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary; 2 | 3 | import com.sslibrary.object.Book; 4 | import com.sslibrary.object.BookClass; 5 | import com.sslibrary.object.Books; 6 | import com.sslibrary.object.RootBookClass; 7 | import com.sslibrary.spider.BookSearch; 8 | 9 | import java.io.IOException; 10 | import java.util.List; 11 | import java.util.Set; 12 | 13 | /** 14 | * @author padeoe 15 | * @Date: 2016/12/10 16 | */ 17 | public class Start { 18 | /** 19 | * 一个使用示例。请修改下面代码的两个文件存储路径,再运行。 20 | * 当前示例会下载“自然科学总论”(0N)分类下所有书。 21 | * 下载过程中可以终止程序从而终止下载。下一次下载时会跳过下载分类中已有的书本。 22 | * 23 | * @param args 24 | */ 25 | public static void main(String[] args) { 26 | //创建一个书目分类,此处定义的是0N 自然科学总论,具体解释请参考中图法 27 | // 格式必须和南京大学馆藏数字化图书平台一致 28 | BookClass root=new BookClass("0N","自然科学总论",new RootBookClass()); 29 | try { 30 | root.downloadWithCataDir("F:\\Book\\all",5,"F:\\error.log"); 31 | // 
root.downloadWithCataDir("/opt/seafile/wkk_test/all",5,"/opt/seafile/wkk_test/error.log"); 32 | } catch (IOException e) { 33 | e.printStackTrace(); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/fix/FileRenamer.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.fix; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.nio.file.Files; 8 | import java.nio.file.Path; 9 | import java.nio.file.Paths; 10 | import java.util.Arrays; 11 | 12 | /** 13 | * 重命名之前版本程序下载的文件。 14 | * 之前版本造成了下载的文件命名不合理。 15 | * 16 | * @author padeoe 17 | * @Date: 2016/12/13 18 | */ 19 | public class FileRenamer { 20 | public static void main(String args[]) { 21 | renameZero("G:\\Test\\"); 22 | } 23 | 24 | public static void renameZero(String rootDirPath) { 25 | Path root = Paths.get(rootDirPath); 26 | File rootDir = root.toFile(); 27 | if (rootDir.isDirectory()) { 28 | File dirs[] = rootDir.listFiles(); 29 | Arrays.asList(dirs).parallelStream().forEach(FileRenamer::handleEndDir); 30 | } else { 31 | System.out.println("根目录不是目录,终止"); 32 | } 33 | } 34 | 35 | public static void handleEndDir(File dir) { 36 | if (dir.isDirectory()) { 37 | System.out.println("正在处理" + dir.getName()); 38 | File files[] = dir.listFiles(); 39 | Arrays.asList(files).parallelStream().forEach(file -> rename(dir, file)); 40 | } else { 41 | System.out.println(dir.getName() + "不是目录,跳过"); 42 | } 43 | } 44 | 45 | private static void rename(File dir, File file) { 46 | String name = file.getName(); 47 | if (name.endsWith("png") || name.endsWith("jpg")) { 48 | String prefix = name.substring(0, name.indexOf('.')); 49 | name = name.replaceAll(prefix, String.format("%04d", Integer.parseInt(prefix))); 50 | try { 51 | Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath()); 52 | } catch 
(IOException e) { 53 | System.out.println(file.toString()); 54 | } 55 | } 56 | } 57 | 58 | public static void renameSuffix(String rootDirPath) { 59 | Path root = Paths.get(rootDirPath); 60 | File rootDir = root.toFile(); 61 | if (rootDir.isDirectory()) { 62 | File dirs[] = rootDir.listFiles(); 63 | Arrays.asList(dirs).parallelStream().forEach(FileRenamer::imageEndDir); 64 | } else { 65 | System.out.println("根目录不是目录,终止"); 66 | } 67 | } 68 | 69 | public static void imageEndDir(File dir) { 70 | if (dir.isDirectory()) { 71 | System.out.println("正在处理" + dir.getName()); 72 | File files[] = dir.listFiles(); 73 | for (File file : files) { 74 | String name = file.getName(); 75 | String prefix = name.substring(0, name.indexOf('.')); 76 | String trueSuffix = getImageSuffix(file); 77 | if ((name.endsWith("png") || name.endsWith("jpg")) && trueSuffix != null && !name.endsWith(trueSuffix)) { 78 | name = prefix + "." + trueSuffix; 79 | // System.out.println("需要修改为"+name); 80 | try { 81 | Files.move(file.toPath(), new File(dir.getPath() + "\\" + name).toPath()); 82 | } catch (IOException e) { 83 | System.out.println("修改出错" + file.toString()); 84 | } 85 | } 86 | } 87 | } else { 88 | System.out.println(dir.getName() + "不是目录,跳过"); 89 | } 90 | } 91 | 92 | 93 | public static String getImageSuffix(File image) { 94 | FileInputStream fileInputStream; 95 | InputStream inputStream; 96 | try { 97 | fileInputStream = new FileInputStream(image); 98 | inputStream = fileInputStream; 99 | byte[] array = new byte[10]; 100 | inputStream.read(array, 0, 10); 101 | if (array[6] == 'J' && array[7] == 'F' && array[8] == 'I' && array[9] == 'F') { 102 | inputStream.close(); 103 | return "jpg"; 104 | } else { 105 | inputStream.close(); 106 | return "png"; 107 | } 108 | } catch (IOException e) { 109 | e.printStackTrace(); 110 | } 111 | return null; 112 | 113 | } 114 | } 115 | -------------------------------------------------------------------------------- 
/src/main/java/com/sslibrary/fix/MissingPageCompletion.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.fix; 2 | 3 | import com.sslibrary.spider.BookDownloader; 4 | 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.Iterator; 10 | import java.util.List; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | /** 15 | * 读取下载日志中的错误,进行缺页补全。 16 | * 17 | * @author padeoe 18 | * Date: 2016/12/09 19 | */ 20 | public class MissingPageCompletion { 21 | private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString(); 22 | private Pattern pattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}"); 23 | 24 | /** 25 | * 创建一个{@code MissingPageCompletion}对象并将日志路径指定为{@code logLocation} 26 | * 27 | * @param logLocation 日志文件路径 28 | */ 29 | public MissingPageCompletion(String logLocation) { 30 | this.logLocation = logLocation; 31 | } 32 | 33 | /** 34 | * 读取日志中所有下载失败的单页信息并重新下载一次。 35 | * 重新下载的日志会输入到原日志文件中 36 | */ 37 | public void complete() { 38 | try { 39 | List<String> lines = Files.readAllLines(Paths.get(logLocation)); 40 | Iterator<String> iterator = lines.iterator(); 41 | while (iterator.hasNext()) { 42 | String line = iterator.next(); 43 | Matcher matcher = pattern.matcher(line); 44 | String url, location; 45 | if (matcher.find()) { 46 | url = matcher.group(1); 47 | location = matcher.group(2); 48 | System.out.println(url + " " + location); 49 | try { 50 | BookDownloader.downloadImage(url, location); 51 | iterator.remove(); 52 | } catch (IOException downloadFail) { 53 | } 54 | } 55 | } 56 | 57 | StringBuilder newLog = new StringBuilder(); 58 | lines.forEach(line -> newLog.append(line).append(System.getProperty("line.separator"))); 59 | FileWriter writer = new FileWriter(logLocation, false); 60 | writer.write(newLog.toString()); 61 | 
writer.close(); 62 | } catch (IOException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | /** 68 | * 获取当前指定的日志的位置。 69 | * 如果没有指定位置,将默认使用当前路径下的名为{@link BookDownloader#ERROR_LOG_NAME}的文件 70 | * 71 | * @return 当前指定的日志的位置 72 | */ 73 | public String getLogLocation() { 74 | return logLocation; 75 | } 76 | 77 | /** 78 | * 指定输入的日志的位置 79 | * 80 | * @param logLocation 作为输入的日志的位置 81 | */ 82 | public void setLogLocation(String logLocation) { 83 | this.logLocation = logLocation; 84 | } 85 | 86 | /** 87 | * 获取当前指定的错误日志的单行格式 88 | * 89 | * @return 错误日志的单行格式 90 | */ 91 | public Pattern getPattern() { 92 | return pattern; 93 | } 94 | 95 | /** 96 | * 设置日志的单行格式 97 | * 98 | * @param pattern 日志的单行格式 99 | */ 100 | public void setPattern(Pattern pattern) { 101 | this.pattern = pattern; 102 | } 103 | 104 | public static void main(String[] args) { 105 | new MissingPageCompletion("/opt/seafile/wkk_test/error.log").complete(); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/fix/Recovery.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.fix; 2 | 3 | import com.sslibrary.spider.BookDownloader; 4 | 5 | import java.nio.file.Paths; 6 | import java.util.regex.Pattern; 7 | 8 | /** 9 | * 错误恢复类。 10 | * 11 | * 用于对读取错误日志,进行错误恢复。 12 | * 错误主要包括页下载失败和书本下载失败两种。 13 | * @author padeoe 14 | * @Date 2017/1/11. 
15 | */ 16 | public class Recovery { 17 | private String logLocation = Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString(); 18 | private Pattern PageExceptionPattern = Pattern.compile("PageDLException\\{url='(.*)', location='(.*)'\\}"); 19 | private Pattern BookExceptionPattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass='(.*)', detailBookClass='(.*)'\\}"); 20 | private String bookRootLocation; 21 | public static final String FIX_LOG_FILENAME="fix.log"; 22 | 23 | /** 24 | * 创建并初始化一个错误恢复对象。 25 | * 26 | * 指定错误日志文件的路径和书本下载的存储根路径 27 | * @param logLocation 错误日志文件的路径 28 | * @param bookRootLocation 书本下载存储路径的根分类路径 29 | */ 30 | public Recovery(String logLocation, String bookRootLocation) { 31 | this.logLocation = logLocation; 32 | this.bookRootLocation = bookRootLocation; 33 | } 34 | 35 | /** 36 | * 读取错误日志,进行错误恢复 37 | */ 38 | public void recover(){ 39 | 40 | 41 | } 42 | 43 | /** 44 | * 设置页下载失败日志行的格式 45 | * @param pageExceptionPattern 页下载失败日志行的格式 46 | */ 47 | public void setPageExceptionPattern(Pattern pageExceptionPattern) { 48 | PageExceptionPattern = pageExceptionPattern; 49 | } 50 | 51 | /** 52 | * 设置书本下载失败日志行的格式 53 | * @param bookExceptionPattern 书本下载失败日志行的格式 54 | */ 55 | public void setBookExceptionPattern(Pattern bookExceptionPattern) { 56 | BookExceptionPattern = bookExceptionPattern; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/Book.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | import cn.chineseall.Node; 4 | import com.sslibrary.object.exception.BookDLException; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.select.Elements; 8 | import com.sslibrary.spider.BookDownloader; 9 | import com.sslibrary.spider.NJULib; 10 | import utils.network.MyHttpRequest; 11 | 
12 | import java.io.IOException; 13 | import java.net.URLDecoder; 14 | import java.util.List; 15 | 16 | /** 17 | * 图书。 18 | *

19 | * 对应南京大学馆藏数字化图书平台 中的图书。 20 | * 21 | * @author padeoe 22 | * @Date: 2016/12/08 23 | */ 24 | public class Book { 25 | /** 26 | * 书的id,唯一识别号,是由南京大学馆藏数字化图书平台定义的 27 | */ 28 | private String id; 29 | /** 30 | * 书名,应该总是包含书名号《》 31 | */ 32 | private String name; 33 | private String author; 34 | /** 35 | * 书本出本社 36 | */ 37 | private String press; 38 | 39 | private String outlineUrl; 40 | 41 | public String getOutlineUrl() { 42 | return outlineUrl; 43 | } 44 | 45 | public void setOutlineUrl(String outlineUrl) { 46 | this.outlineUrl = outlineUrl; 47 | } 48 | 49 | private List outline; 50 | 51 | public List getOutline() { 52 | return outline; 53 | } 54 | 55 | public void setOutline(List outline) { 56 | this.outline = outline; 57 | } 58 | 59 | /** 60 | * 初始化一个新创建的{@code Book}对象。 61 | *

62 | * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例, 63 | * 或者使用{@link com.sslibrary.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。 64 | * 65 | * @param id 书本id,需要和南京大学馆藏数字化图书平台服务器一致 66 | */ 67 | public Book(String id) { 68 | this.id = id; 69 | } 70 | 71 | /** 72 | * 获取书本的编号 73 | * 74 | * @return 书本编号 75 | */ 76 | public String getId() { 77 | return id; 78 | } 79 | 80 | /** 81 | * 设置书本编号 82 | * 83 | * @param id 书本编号 84 | */ 85 | public void setId(String id) { 86 | this.id = id; 87 | } 88 | 89 | /** 90 | * 获取书本名 91 | * 92 | * @return 书名,包含书名号《》 93 | */ 94 | public String getName() { 95 | return name; 96 | } 97 | 98 | /** 99 | * 设置书名 100 | * 101 | * @param name 书名 102 | */ 103 | public void setName(String name) { 104 | this.name = name; 105 | } 106 | 107 | /** 108 | * 获取书本作者,可能是null 109 | * 110 | * @return 书本作者 111 | */ 112 | public String getAuthor() { 113 | return author; 114 | } 115 | 116 | /** 117 | * 设置书本作者 118 | * 119 | * @param author 书本作者 120 | */ 121 | public void setAuthor(String author) { 122 | this.author = author; 123 | } 124 | 125 | /** 126 | * 获取书本出版社 127 | * @return 128 | */ 129 | public String getPress() { 130 | return press; 131 | } 132 | 133 | /** 134 | * 指定书本出版社 135 | * @param press 136 | */ 137 | public void setPress(String press) { 138 | this.press = press; 139 | } 140 | 141 | /** 142 | * 获取书本出版日期 143 | * 144 | * @return 书本出版日期 145 | */ 146 | public String getPublishDate() { 147 | return publishDate; 148 | } 149 | 150 | public void setPublishDate(String publishDate) { 151 | this.publishDate = publishDate; 152 | } 153 | 154 | /** 155 | * 获取书本主题词,可能是null 156 | * 157 | * @return 书本主题词 158 | */ 159 | public String getTheme() { 160 | return theme; 161 | } 162 | 163 | public void setTheme(String theme) { 164 | this.theme = theme; 165 | } 166 | 167 | /** 168 | * 获取书本所在分类 169 | * 170 | * @return 书本所在分类 171 | */ 172 | public BookClass getBookClass() { 173 | return bookClass; 174 | } 175 | 176 | public void setBookClass(BookClass 
bookClass) { 177 | this.bookClass = bookClass; 178 | } 179 | 180 | /** 181 | * 获取书本所在末级分类 182 | * 183 | * @return 字符串描述所属分类,最末层的分类,用>分割层级, 184 | * 例如“数理科学和化学图书馆>数学>总论复分>总论” 185 | */ 186 | public String getDetailBookClass() { 187 | return detailBookClass; 188 | } 189 | 190 | public void setDetailBookClass(String detailBookClass) { 191 | this.detailBookClass = detailBookClass; 192 | } 193 | 194 | private String publishDate; 195 | private String theme; 196 | /** 197 | * 所属分类 198 | */ 199 | private BookClass bookClass = new RootBookClass(); 200 | /** 201 | * 所属分类的中文描述。 202 | * “>”分割层级, 203 | * 例如“数理科学和化学图书馆>数学>总论复分>总论” 204 | */ 205 | private String detailBookClass; 206 | 207 | public String getCookie() { 208 | return cookie; 209 | } 210 | 211 | void setCookie(String cookie) { 212 | this.cookie = cookie; 213 | } 214 | 215 | private String cookie; 216 | 217 | /** 218 | * 初始化一个新创建的{@code Book}对象。需要{@code Book}的所有属性。 219 | * 如果你没有足够的参数信息调用该方法创建对象,可调用{@link #getBookFromUrl(String)}通过书本的在线阅读地址获取实例, 220 | * 或者使用{@link com.sslibrary.spider.BookSearch}中的方法根据书名等字段查询并创建满足条件的的图书实例。 221 | * 222 | * @param id {@code Book}的id。该id是服务器命名的 223 | * @param name 书名 224 | * @param author 作者 225 | * @param publishDate 出版日期 226 | * @param theme 主题词 227 | * @param bookClass 书本分类 228 | * @param detailBookClass 书本分类分类名路径 229 | */ 230 | public Book(String id, String name, String author, String publishDate, String theme, BookClass bookClass, String detailBookClass) { 231 | this.id = id; 232 | this.name = name; 233 | this.author = author; 234 | this.publishDate = publishDate; 235 | this.theme = theme; 236 | this.bookClass = bookClass; 237 | this.detailBookClass = detailBookClass; 238 | } 239 | 240 | 241 | /** 242 | * 通过在线阅览的地址来获取{@code Book}对象 243 | * 244 | * @param onlineReadUrl 书本的在线阅读地址 245 | * @return Book对象,仅指定了id 246 | */ 247 | public static Book getBookFromUrl(String onlineReadUrl) { 248 | String[]para=onlineReadUrl.split("/"); 249 | if(para!=null&&para.length>6){ 250 | Book book=new 
Book(para[para.length-3]); 251 | book.fillBookInfoByUrl(onlineReadUrl); 252 | return book; 253 | } 254 | return null; 255 | } 256 | 257 | /** 258 | * 通过在线阅读页面补全{@code Book}的信息 259 | * 仅可补全{@link #name},{@link #id},{@link #author},{@link #publishDate} 260 | * 261 | * @param url 书本的在线阅读页面 262 | */ 263 | public void fillBookInfoByUrl(String url) { 264 | try { 265 | String html = new BookDownloader(this,url).getBookViewPageHtml(); 266 | Document doc = Jsoup.parse(html); 267 | Elements nameNode = doc.getElementsByTag("title"); 268 | this.name = nameNode.text().replaceAll("\\s+", " "); 269 | Elements infoNode = doc.select("div[id=bookinfo]"); 270 | if(infoNode!=null){ 271 | String bookinfo=infoNode.text(); 272 | int author_end=bookinfo.indexOf(name); 273 | if(author_end!=-1){ 274 | this.author=infoNode.text().substring(0,author_end-1); 275 | } 276 | String [] bookInfo=infoNode.text().substring(author_end,infoNode.text().length()).split(","); 277 | if(bookInfo.length>2){ 278 | this.press=bookInfo[1]; 279 | this.publishDate=bookInfo[2]; 280 | } 281 | } 282 | } catch (BookDLException e) { 283 | e.printStackTrace(); 284 | }catch (Exception e) { 285 | e.printStackTrace(); 286 | } 287 | } 288 | 289 | /** 290 | * 获取书本的在线阅读地址。 291 | * 292 | * @return 书本在线与阅读的URL 293 | * @throws IOException IO错误 294 | */ 295 | public String getbookread() throws IOException { 296 | resetCookie(); 297 | String para = "BID=" + id + "&ReadMode=0&pdfread=0&displaystyle=0"; 298 | String Url = NJULib.baseUrl + "/getbookread?" + para; 299 | String result = MyHttpRequest.getWithCookie(Url, null, cookie, "UTF-8", 1000); 300 | return NJULib.baseUrl + URLDecoder.decode(result, "UTF-8"); 301 | } 302 | 303 | /** 304 | * 重置{@link #cookie} 305 | * 306 | * @throws IOException 重置cookie失败 307 | */ 308 | private void resetCookie() throws IOException { 309 | cookie = (cookie == null) ? 
NJULib.getSession() : cookie; 310 | } 311 | 312 | @Override 313 | public String toString() { 314 | return "Book{" + 315 | "id='" + id + '\'' + 316 | ", name='" + name + '\'' + 317 | ", author='" + author + '\'' + 318 | ", press='" + press + '\'' + 319 | ", publishDate='" + publishDate + '\'' + 320 | ", theme='" + theme + '\'' + 321 | ", bookClass=" + bookClass + 322 | ", detailBookClass='" + detailBookClass + '\'' + 323 | '}'; 324 | } 325 | 326 | /** 327 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 328 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 329 | * 错误日志将在当前路径下名为"error.log" 330 | */ 331 | public void download(String onlineReadUrl) { 332 | BookDownloader bookDownloader = new BookDownloader(this,onlineReadUrl); 333 | bookDownloader.downloadAllImages(); 334 | } 335 | 336 | /** 337 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 338 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 339 | * 错误日志将在当前路径下名为"error.log" 340 | * 341 | * @param pathname 下载存储目录 342 | * @param threadNumber 下载线程数 343 | */ 344 | public void download(String pathname, int threadNumber,String onlineReadUrl) { 345 | BookDownloader bookDownloader = new BookDownloader(this,onlineReadUrl); 346 | bookDownloader.setPath(pathname); 347 | bookDownloader.setThreadNumber(threadNumber); 348 | bookDownloader.downloadAllImages(); 349 | } 350 | 351 | /** 352 | * 下载该书。将下载许多图片,书的每一页都是一张png图片。 353 | * 将会在{@code pathname}下创建一个以书名命名的文件夹,并存储所有图片。 354 | * 355 | * @param pathname 下载存储目录 356 | * @param threadNumber 线程数 357 | * @param errorLogPath 错误日志路径 358 | */ 359 | public void download(String pathname, int threadNumber, String onlineReadUrl,String errorLogPath) { 360 | BookDownloader bookDownloader = new BookDownloader(this,onlineReadUrl); 361 | bookDownloader.setPath(pathname); 362 | bookDownloader.setThreadNumber(threadNumber); 363 | bookDownloader.setErrorLogPath(errorLogPath); 364 | bookDownloader.downloadAllImages(); 365 | } 366 | 367 | 368 | @Override 369 | public int hashCode() { 370 | return Integer.parseInt(this.getId()); 371 | } 372 
| 373 | @Override 374 | public boolean equals(Object obj) { 375 | if (!(obj instanceof Book)) 376 | return false; 377 | if (obj == this) 378 | return true; 379 | return this.id.equals(((Book) obj).id); 380 | } 381 | 382 | public com.njulib.object.Book cast(){ 383 | return new com.njulib.object.Book(id,name,author,publishDate,theme,null,null); 384 | } 385 | } 386 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/BookClass.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | import com.sslibrary.spider.BookDownloader; 8 | import com.sslibrary.spider.NJULib; 9 | import utils.conversion.MyDecoder; 10 | import utils.network.MyHttpRequest; 11 | 12 | import java.io.IOException; 13 | import java.nio.file.Paths; 14 | import java.util.*; 15 | import java.util.concurrent.atomic.AtomicInteger; 16 | import java.util.regex.Matcher; 17 | import java.util.regex.Pattern; 18 | import java.util.stream.Collectors; 19 | 20 | /** 21 | * 图书的分类。 22 | *

23 | * 对应南京大学馆藏数字化图书平台 中的图书。 24 | * 同时分类名和分类编号满足中图法分类。是树结构。具有查询子分类和查询分类下书籍列表,批量下载分类书籍等功能。 25 | * 如果你没有足够信息构造实例,可以通过{@link RootBookClass}查询所有分类来获取实例。 26 | * 或者{@link com.sslibrary.spider.BookSearch}中的一些方法获取实例。 27 | * 28 | * @author padeoe 29 | * @Date: 2016/12/08 30 | */ 31 | public class BookClass { 32 | /** 33 | * 分类id,服务器定义的中图法分类id, 34 | * 例如"0T0P3010" 35 | */ 36 | private String id; 37 | /** 38 | * 分类名称 39 | */ 40 | private String name; 41 | /** 42 | * 父分类 43 | */ 44 | private BookClass parent; 45 | /** 46 | * 子分类列表 47 | */ 48 | private Map<String, BookClass> children; 49 | 50 | 51 | /** 52 | * 子分类{@link #children}是否已经被加载 53 | */ 54 | private boolean isLoaded = false; 55 | 56 | /** 57 | * 查看当对象所使用的cookie 58 | * 59 | * @return cookie 60 | */ 61 | public String getCookie() { 62 | return cookie; 63 | } 64 | 65 | /** 66 | * 设置{@code cookie},BookClass的子每一次子分类加载, 67 | * 书籍查询等操作都需要cookie,设置的cookie将会对所有子分类使用, 68 | * 以避免频繁获取cookie 69 | * 70 | * @param cookie cookie 71 | */ 72 | public void setCookie(String cookie) { 73 | this.cookie = cookie; 74 | } 75 | 76 | /** 77 | * 查询分类信息时向服务器发送的cookie,初始=null。 78 | * 当调用了需要网络的方法时,将会被初始化。 79 | * 一个{@link BookClass}对象的所有子分类{@link #children}都是用的同一个cookie 80 | */ 81 | private String cookie; 82 | 83 | /** 84 | * 获取子分类的数量 85 | * 86 | * @return 子分类的数量 87 | */ 88 | public int getChildCount() { 89 | return children.size(); 90 | } 91 | 92 | /** 93 | * 获取父分类 94 | * 95 | * @return 父分类。如果不存在则为null 96 | */ 97 | public BookClass getParent() { 98 | return parent; 99 | } 100 | 101 | 102 | /** 103 | * 获取所有子分类。 104 | * 初始为null,若要查看子分类,必须先调用{@link #loadChild()}或者{@link #loadAllChild()}从服务器查询并加载 105 | * 106 | * @return 子分类的集合 107 | */ 108 | public Set<BookClass> getChildren() { 109 | return children.values().stream().collect(Collectors.toSet()); 110 | } 111 | 112 | /** 113 | * 查询特定子分类。 114 | * 115 | * @param idOrName 子分类的名称或者代号。符合中图法分类。 116 | * @return 子分类 117 | */ 118 | public BookClass getChild(String idOrName) { 119 | return children.get(idOrName); 120 | } 121 | 122 | public 
String getName() { 123 | return name; 124 | } 125 | 126 | public void setName(String name) { 127 | this.name = name; 128 | } 129 | 130 | public String getId() { 131 | return id; 132 | } 133 | 134 | public void setId(String id) { 135 | this.id = id; 136 | } 137 | 138 | public void setParent(BookClass parent) { 139 | this.parent = parent; 140 | } 141 | 142 | public boolean isTerminal() { 143 | return false; 144 | } 145 | 146 | /** 147 | * 添加一个子分类 148 | * 149 | * @param bookClass 子分类 150 | * @return 如果同id的子分类已存在,则返回之前的子分类,如果不存在,则添加并返回null 151 | */ 152 | public BookClass addChild(BookClass bookClass) { 153 | if (bookClass.name != null) { 154 | children.putIfAbsent(bookClass.name, bookClass); 155 | } 156 | return children.putIfAbsent(bookClass.id, bookClass); 157 | } 158 | 159 | /** 160 | * 创建并初始化一个书本分类。指定分类编号,分类名称和父分类。 161 | * 162 | * @param id 分类编号 163 | * @param name 分类名称 164 | * @param parent 父分类 165 | */ 166 | public BookClass(String id, String name, BookClass parent) { 167 | this.id = id; 168 | this.name = name; 169 | this.parent = parent; 170 | children = new HashMap<>(); 171 | } 172 | 173 | /** 174 | * 创建并初始化一个书本分类。指定分类编号,分类名称和父分类。 175 | * 176 | * @param id 分类编号 177 | * @param name 分类名称 178 | */ 179 | public BookClass(String id, String name) { 180 | this.id = id; 181 | this.name = name; 182 | children = new HashMap<>(); 183 | } 184 | 185 | 186 | /** 187 | * 创建一个新初始化的{@code BookClass}对象, 188 | * 使之中图法分类标识是{@code id} 189 | * 190 | * @param id 分类的中图法分类标识。 191 | * 需要和南京大学馆藏数字化图书平台定义的格式一致 192 | */ 193 | public BookClass(String id) { 194 | this.id = id; 195 | children = new HashMap<>(); 196 | this.isLoaded = false; 197 | } 198 | 199 | /** 200 | * 加载子分类。仅加载一层子分类,即子分类的子分类不会被加载。 201 | * 当该方法被调用时,会向服务器查询该分类的子分类并更新该对象的{@link #children} 202 | *

203 | * 如需递归加载子分类,调用{@link #loadAllChild()} 204 | * 205 | * @throws IOException 从服务器查询子节点出错 206 | */ 207 | public void loadChild() throws IOException { 208 | if (!isTerminal()) { 209 | checkCookie(); 210 | String Url = NJULib.baseUrl + "/classifyview"; 211 | String data = "fenlei=" + this.getId() + "&lib=markbook"; 212 | String result = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "UTF-8", 1000); 213 | // System.out.println(result); 214 | Document doc = Jsoup.parse(result); 215 | Elements li = doc.getElementsByTag("li"); 216 | for (Element bookClassId : li) { 217 | String id = bookClassId.attr("id"); 218 | String name = bookClassId.getElementsByTag("a").text(); 219 | boolean hasSubTree = bookClassId.getElementsByTag("img").attr("onClick").contains("getSubTree"); 220 | //System.out.println(id+" "+NJULib.decodeUrlUnicode(name)); 221 | BookClass child = hasSubTree ? new BookClass(id, MyDecoder.decodeUrlUnicode(name), this) : 222 | new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(name), this); 223 | child.setCookie(cookie); 224 | this.addChild(child); 225 | } 226 | this.isLoaded = true; 227 | } 228 | } 229 | 230 | 231 | /** 232 | * 迭代加载所有子分类。 233 | * 直至加载到每个分类的末层分类。 234 | * 235 | * @throws IOException 从服务器查询时出错 236 | */ 237 | public void loadAllChild() throws IOException { 238 | if (!isTerminal()) { 239 | loadChild(); 240 | for (BookClass child : getChildren()) { 241 | child.loadAllChild(); 242 | } 243 | } 244 | } 245 | 246 | 247 | /** 248 | * 下载分类下所有图书,会迭代测创建分类文件夹 249 | * 250 | * @param pathname 存储路径。将在该路径下创建多级分类目录并保存下载的图书 251 | * @param threadNumber 线程数 252 | * @param errorLogPath 错误日志路径 253 | * @throws IOException 连接失败的错误 254 | */ 255 | public void downloadWithCataDir(String pathname, int threadNumber, String errorLogPath) throws IOException { 256 | if (!isTerminal()) { 257 | loadChild(); 258 | BookClass[]bookClasses=getChildren().toArray(new BookClass[]{}); 259 | for(int i=bookClasses.length-1;i>=0;i--){ 260 | 
bookClasses[i].downloadWithCataDir(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath); 261 | } 262 | /* for (BookClass child : getChildren()) { 263 | child.downloadWithCataDir(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath); 264 | }*/ 265 | } else { 266 | downloadAllBooks(Paths.get(pathname, name == null ? id : name).toString(), threadNumber, errorLogPath); 267 | } 268 | } 269 | 270 | /** 271 | * 下载分类下所有图书,会迭代测创建分类文件夹 272 | * 下载存储路径为当前路径,线程数为5,错误日志将保存在当前路径,文件名为{@link BookDownloader#ERROR_LOG_NAME} 273 | * 可以调用重载{@link #downloadWithCataDir(String, int, String)}设置参数 274 | * 275 | * @throws IOException 连接失败的错误 276 | */ 277 | public void downloadWithCataDir() throws IOException { 278 | downloadWithCataDir(System.getProperty("user.dir"), 5, Paths.get(System.getProperty("user.dir"), BookDownloader.ERROR_LOG_NAME).toString()); 279 | } 280 | 281 | /** 282 | * 从服务器获取该分类下图书列表的第{@code page}页。 283 | * 图书列表的分页是服务器做的,每页最多10条图书。 284 | *

285 | * 页数的最大值可以根据{@link #queryBooksSize()}自行计算 286 | * 287 | * @param page 图书列表的页码 288 | * @return 列表该页记录的图书 289 | * @throws IOException 从服务器查询书本列表时出错 290 | */ 291 | public Set queryBooks(int page) throws IOException { 292 | checkCookie(); 293 | String data = "fenlei=" + this.id + "&mark=all&Page=" + page + "&totalnumber=-1"; 294 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 295 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 296 | // System.out.println(html); 297 | Set books = queryBooks(html); 298 | return books; 299 | 300 | } 301 | 302 | /** 303 | * 获得某分类下的所有图书 304 | * 305 | * @return 分类下所有图书 306 | * @throws IOException 从服务器查询书本列表时出错 307 | */ 308 | public Set queryAllBooks() throws IOException { 309 | return queryAllBooks(1); 310 | } 311 | 312 | /** 313 | * 获得分类下的所有图书 314 | * 315 | * @param threadNumber 线程数 316 | * @return 图书集合 317 | * @throws IOException 连接错误 318 | */ 319 | public Set queryAllBooks(int threadNumber) throws IOException { 320 | checkCookie(); 321 | String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1"; 322 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 323 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 324 | // System.out.println(html); 325 | Document doc = Jsoup.parse(html); 326 | Elements form = doc.select("a:contains(末页)"); 327 | 328 | if (!form.isEmpty()) { 329 | String keyword = form.get(0).attr("href"); 330 | String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1); 331 | int size = Integer.parseInt(booksize); 332 | System.out.println("一共 " + size + " 本书"); 333 | Set books = queryBooks(html); 334 | List threadList = new ArrayList<>(); 335 | 336 | AtomicInteger needGettedPage = new AtomicInteger(2);//需要获取的页码 337 | int lastPage = size / 10 + 1;//最后一页的页码 338 | //开始多线程刷所有页码 339 | for (int threadN = 0; threadN < threadNumber; threadN++) { 340 | threadList.add(new 
PageGetThread(needGettedPage, lastPage)); 341 | } 342 | 343 | for (PageGetThread thread : threadList) { 344 | thread.start(); 345 | } 346 | for (PageGetThread thread : threadList) { 347 | try { 348 | thread.join(); 349 | } catch (InterruptedException e) { 350 | e.printStackTrace(); 351 | } 352 | } 353 | threadList.forEach(pageGetThread -> books.addAll(pageGetThread.getThreadBooks())); 354 | return books; 355 | } 356 | return null; 357 | } 358 | 359 | /** 360 | * 下载分类下所有图书。 361 | * 所有书籍将直接保存在{@code pathname}目录下,每本书一个文件夹,以书名命名。如同名,则加作者名,如又同名,加书本编号 362 | * 363 | * @param pathname 存储路径。书本文件夹所在的上级路径 364 | * @param threadNumber 线程数 365 | * @param errorLogPath 错误日志路径 366 | * @throws IOException 连接失败的错误 367 | */ 368 | public void downloadAllBooks(String pathname, int threadNumber, String errorLogPath) throws IOException { 369 | checkCookie(); 370 | String data = "fenlei=" + this.id + "&mark=all&Page=1&totalnumber=-1"; 371 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 372 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 373 | // System.out.println(html); 374 | Document doc = Jsoup.parse(html); 375 | Elements form = doc.select("a:contains(末页)"); 376 | if (!form.isEmpty()) { 377 | String keyword = form.get(0).attr("href"); 378 | String booksize = keyword.substring(keyword.lastIndexOf(",") + 1, keyword.length() - 1); 379 | int size = Integer.parseInt(booksize); 380 | System.out.println(this.getPath()+"一共 " + size + " 本书"); 381 | Set books = queryBooks(html); 382 | Set downloading; 383 | downloadBooks(books, pathname, threadNumber, errorLogPath); 384 | int lastPage = size / 10 + 1;//最后一页的页码 385 | int index = 1; 386 | for (int i = lastPage; i >= 2; i--) { 387 | downloading = queryBooks(i); 388 | for (Book book : downloading) { 389 | if (books.add(book)) { 390 | book.download(pathname, threadNumber, errorLogPath); 391 | index++; 392 | } else { 393 | System.out.println("服务器返回了重复书籍,跳过 " + book); 394 | } 395 | } 396 | } 
397 | System.out.println("去重后共" + books.size() + "书,实际下载了" + (index + 10) + "本书(含失败)"); 398 | } 399 | } 400 | 401 | private void downloadBooks(Set books, String pathname, int threadNumber, String errorLogPath) { 402 | for (Book book : books) { 403 | book.download(pathname, threadNumber, errorLogPath); 404 | } 405 | } 406 | 407 | 408 | /** 409 | * 获取所有图书列表的线程 410 | */ 411 | class PageGetThread extends Thread { 412 | Set books = new HashSet<>(); 413 | AtomicInteger needGettedPage; 414 | int lastPage; 415 | 416 | public PageGetThread(AtomicInteger needGettedPage, int lastPage) { 417 | this.needGettedPage = needGettedPage; 418 | this.lastPage = lastPage; 419 | } 420 | 421 | @Override 422 | public void run() { 423 | while (true) { 424 | int gettingpage = needGettedPage.getAndIncrement(); 425 | if (gettingpage <= lastPage) { 426 | try { 427 | if (gettingpage % 10 == 0) { 428 | resetCookie(); 429 | } 430 | books.addAll(queryBooks(gettingpage)); 431 | } catch (IOException e) { 432 | e.printStackTrace(); 433 | } 434 | } else { 435 | break; 436 | } 437 | } 438 | } 439 | 440 | public Set getThreadBooks() { 441 | return books; 442 | } 443 | } 444 | 445 | 446 | /** 447 | * 获取HTML文本中的书籍并根据其分类添加进当前的分类结构 448 | * 449 | * @param html 服务器特定页面返回的包含书本信息的HTML文本。 450 | * 服务器多个不同页面返回的包含书本信息的HTML中书本信息相关节点的格式都相似。均可调用本函数 451 | * @return HTML中记录的书本 452 | */ 453 | public Set queryBooks(String html) { 454 | Document doc = Jsoup.parse(html); 455 | Elements booksliNode = doc.select("li[style]"); 456 | return queryBooks(booksliNode); 457 | } 458 | 459 | private Set queryBooks(Elements booksliNode) { 460 | Set books = new HashSet<>(); 461 | for (Element element : booksliNode) { 462 | //获取书名和id 463 | String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null; 464 | BookClass bookBookClass; 465 | Elements nameIdNode = element.select("p[class=name]"); 466 | if (nameIdNode != null) { 467 | name = nameIdNode.text(); 468 | Elements idNode = 
nameIdNode.select("a[onclick]"); 469 | if (idNode != null && idNode.size() > 0) { 470 | String idOnClick = idNode.get(0).attr("onclick"); 471 | int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(","); 472 | if (start != 0 && end != -1) { 473 | id = idOnClick.substring(start, end); 474 | } 475 | } 476 | } 477 | //获取分类 478 | BookClass[] bookClasses = new BookClass[0]; 479 | Elements infoNode = element.select("p[class=info]"); 480 | if (infoNode != null) { 481 | Elements bookInfos = infoNode.select("a"); 482 | if (bookInfos != null && bookInfos.size() > 0) { 483 | Element terminalCataNode = bookInfos.last(); 484 | bookInfos.remove(terminalCataNode); 485 | List tmplist = bookInfos.stream() 486 | .map(bookInfo -> getBookCata(bookInfo, false)) 487 | .filter(Objects::nonNull) 488 | .collect(Collectors.toList()); 489 | BookClass terminalBookClass = getBookCata(terminalCataNode, true); 490 | if (terminalBookClass != null) { 491 | tmplist.add(terminalBookClass); 492 | } 493 | bookClasses = tmplist.toArray(bookClasses); 494 | } 495 | } 496 | bookBookClass = new RootBookClass().link(bookClasses); 497 | 498 | //获取作者,出版日期,主题词,分类 499 | String info = element.text(); 500 | Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 
分类[::](.*)"); 501 | Matcher matcher = pattern.matcher(info); 502 | while (matcher.find()) { 503 | name = matcher.group(1); 504 | author = matcher.group(2); 505 | publishDate = matcher.group(3); 506 | theme = matcher.group(4); 507 | detailBookClass = matcher.group(5); 508 | } 509 | Pattern minPattern = Pattern.compile(".*(《.*》).*"); 510 | Matcher minMatcher = minPattern.matcher(info); 511 | while (minMatcher.find()) { 512 | name = minMatcher.group(1); 513 | } 514 | 515 | //汇总书本 516 | if (name != null && id != null) { 517 | Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass); 518 | book.setCookie(cookie); 519 | books.add(book); 520 | if (bookBookClass.isTerminal()) { 521 | ((TerminalBookClass) bookBookClass).addBook(book); 522 | } else { 523 | System.out.println("未获取到分类信息,将不被归档 " + book); 524 | } 525 | } else { 526 | System.out.println("error: " + info); 527 | } 528 | } 529 | return books; 530 | } 531 | 532 | 533 | /** 534 | * 通过HTML中对应节点获取到书所在分类 535 | * 536 | * @param bookInfo 书本信息的HTML节点 537 | * @param isTerminal 是否是终端分类 538 | * @return 书所在分类。如果是终端分类将会返回{@code TerminalBookClass} 539 | */ 540 | private BookClass getBookCata(Element bookInfo, boolean isTerminal) { 541 | String cataName = bookInfo.text(); 542 | String href = bookInfo.attr("href"); 543 | if (href != null) { 544 | int cataIdStart = href.indexOf('=') + 1; 545 | if (cataIdStart != 0) { 546 | String cataId = href.substring(href.indexOf('=') + 1, href.length()); 547 | BookClass tmp = isTerminal ? 
new TerminalBookClass(cataId) : new BookClass(cataId); 548 | tmp.setName(cataName); 549 | return tmp; 550 | } 551 | 552 | } 553 | return null; 554 | } 555 | 556 | 557 | /** 558 | * 从服务器查询当前分类下图书的数量。包含所有子分类下的图书 559 | * 560 | * @return 当前分类下图书的数量 561 | * @throws IOException 查询失败 562 | */ 563 | public int queryBooksSize() throws IOException { 564 | checkCookie(); 565 | String data = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1"; 566 | String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; 567 | String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); 568 | // System.out.println(html); 569 | Document doc = Jsoup.parse(html); 570 | Elements form = doc.select("input[name=totalnumber]"); 571 | if (!form.isEmpty()) { 572 | String booksize = form.get(0).attr("value"); 573 | return Integer.parseInt(booksize); 574 | } 575 | return 0; 576 | } 577 | 578 | 579 | /** 580 | * 检查{@code cookie}如果为null将会更新cookie 581 | * 582 | * @throws IOException 更新cookie失败 583 | */ 584 | private void checkCookie() throws IOException { 585 | cookie = (cookie == null) ? NJULib.getSession() : cookie; 586 | } 587 | 588 | /** 589 | * 重置{@code cookie} 590 | * 591 | * @throws IOException 重置cookie失败 592 | */ 593 | private void resetCookie() throws IOException { 594 | cookie = NJULib.getSession(); 595 | } 596 | 597 | /** 598 | * 对当前分类添加子分类 599 | * 600 | * @param childBookClasses 顺次路径关系子分类,后一个是前一个的子分类。第一个是当前分类的子分类 601 | * @return 子分类的最后一级分类.若子路径参数为空,则为当前分类 602 | */ 603 | public BookClass link(BookClass... childBookClasses) { 604 | BookClass currentBookClass = this; 605 | for (BookClass bookClass : childBookClasses) { 606 | BookClass previois = currentBookClass.addChild(bookClass); 607 | if (previois != null) { 608 | currentBookClass = previois; 609 | } else { 610 | bookClass.parent = currentBookClass; 611 | currentBookClass = bookClass; 612 | } 613 | } 614 | return currentBookClass; 615 | } 616 | 617 | /** 618 | * 获取分类对象所有终端分类下已存储的书籍 619 | *

620 | * 不会触发网络请求,只是迭代收集子分类的下已存在的书籍。 621 | * 如要即时从服务器查询书籍,请调用{@link #queryAllBooks()}及其重载 622 | * 623 | * @return 该分类下属所有分类的图书集合 624 | */ 625 | public Set getBooks() { 626 | return this.getChildren().stream().map(BookClass::getBooks).collect(HashSet::new, Set::addAll, Set::addAll); 627 | } 628 | 629 | 630 | /** 631 | * 判断两个{@code BookClass}是否是同一个分类。 632 | * 仅根据代号即{@link BookClass#id}来判断 633 | * 634 | * @param obj 任意对象 635 | * @return 对象是否是同一个分类 636 | */ 637 | @Override 638 | public boolean equals(Object obj) { 639 | if (!(obj instanceof BookClass)) 640 | return false; 641 | if (obj == this) 642 | return true; 643 | return this.id.equals(((BookClass) obj).id); 644 | } 645 | 646 | /** 647 | * 获取分类所在的路径。 648 | * 返回可读的{@code String},对二级分类到当前分类顺次所经路径分别调用{@link BookClass#toString()},用"-"分割 649 | * 650 | * @return 从二级分类到当前分类顺次所经路径,用"-"分隔分类 651 | */ 652 | public String getPath() { 653 | Stack parents = new Stack<>(); 654 | BookClass bookClass = this; 655 | while (bookClass!=null&&!bookClass.isRoot()) { 656 | parents.push(bookClass); 657 | bookClass = bookClass.getParent(); 658 | } 659 | StringBuilder sb = new StringBuilder(); 660 | if (!parents.isEmpty()) { 661 | sb.append(parents.pop().toString()); 662 | } 663 | while (!parents.isEmpty()) { 664 | sb.append("-"); 665 | sb.append(parents.pop().toString()); 666 | } 667 | return sb.toString(); 668 | } 669 | 670 | /** 671 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 672 | * 673 | * @return 是否是根分类 674 | */ 675 | public boolean isRoot() { 676 | return false; 677 | } 678 | 679 | /** 680 | * 返回{@code BookClass}的哈希值。 681 | * 会直接使用用{@link #id}的哈希值 682 | * 683 | * @return 哈希值 684 | */ 685 | @Override 686 | public int hashCode() { 687 | return id.hashCode(); 688 | } 689 | 690 | /** 691 | * 返回{@code BookClass}的可读字符串描述。 692 | * 693 | * @return 格式是 "分类代号(分类名)",如果分类名为null,则格式是"分类代号" 694 | */ 695 | @Override 696 | public String toString() { 697 | return this.getId() + (this.getName() == null ? 
"" : "(" + this.getName() + ")"); 698 | } 699 | } 700 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/Books.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * 书本查询的结果。{@link com.sslibrary.spider.BookSearch}类某些方法的返回值用到本类 7 | * 包含了查询出的图书当前页集合,以及查询结果的总页数,书本总数。 8 | * 9 | * @author padeoe 10 | * @Date: 2016/12/09 11 | */ 12 | public class Books { 13 | private int page; 14 | private int totalNums; 15 | private int totalPage; 16 | private Set bookSet; 17 | 18 | /** 19 | * @param page 当前页数 20 | * @param totalPage 总页数 21 | * @param totalNums 总书本数 22 | * @param bookSet 本页的书 23 | */ 24 | public Books(int page, int totalPage, int totalNums, Set bookSet) { 25 | this.totalPage = totalPage; 26 | this.bookSet = bookSet; 27 | } 28 | 29 | /** 30 | * 获取查询到的图书总数 31 | * 32 | * @return 查询到的图书总数 33 | */ 34 | public int getTotalNums() { 35 | return totalNums; 36 | } 37 | 38 | public int getPage() { 39 | return page; 40 | } 41 | 42 | public int getTotalPage() { 43 | return totalPage; 44 | } 45 | 46 | public Set getBookSet() { 47 | return bookSet; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/InfoReader.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | import com.sslibrary.spider.BookDownloader; 4 | 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.List; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | /** 14 | * info文件解析器。 15 | *

16 | * info文件是由{@link com.sslibrary.spider.BookDownloader}在下载过程中创建的文本文件。 17 | * 记录了一个{@link Book#toString()} 18 | * 默认名称是{@link com.sslibrary.spider.BookDownloader#INFO_FILE_NAME}。 19 | * 该类会读取info文件并解析出{@link Book}对象 20 | * 21 | * @author padeoe 22 | * @Date: 2016/12/11 23 | */ 24 | public class InfoReader { 25 | private String infoFilePath; 26 | 27 | public InfoReader(String infoFilePath) { 28 | this.infoFilePath = infoFilePath; 29 | } 30 | 31 | /** 32 | * 解析{@code Book}对象,如果未找到返回null 33 | * 34 | * @return {@code Book}对象 35 | */ 36 | public Book read() { 37 | try { 38 | List lines = Files.readAllLines(Paths.get(infoFilePath), StandardCharsets.UTF_8); 39 | String info = ""; 40 | if (lines.size() > 0) { 41 | info = lines.get(0); 42 | } 43 | Pattern pattern = Pattern.compile("Book\\{id='(.*)', name='(.*)', author='(.*)', publishDate='(.*)', theme='(.*)', bookClass=(.*), detailBookClass='(.*)'\\}"); 44 | Matcher matcher = pattern.matcher(info); 45 | if (matcher.find()) { 46 | return new Book(matcher.group(1), 47 | matcher.group(2), 48 | matcher.group(3), 49 | matcher.group(4), 50 | matcher.group(5), 51 | new BookClass(matcher.group(6)), 52 | matcher.group(7)); 53 | } 54 | return null; 55 | } catch (IOException e) { 56 | e.printStackTrace(); 57 | return null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/RootBookClass.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | /** 4 | * 根分类 5 | *

6 | * 根分类是在中图法分类之外虚拟出的分类。 7 | * 用于集合管理所有子分类,以及作为起点,从服务器获取子分类。 8 | * 9 | * @author padeoe 10 | * @Date: 2016/12/20 11 | */ 12 | public class RootBookClass extends BookClass { 13 | public RootBookClass() { 14 | super("all"); 15 | } 16 | 17 | /** 18 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 19 | * 20 | * @return true 21 | */ 22 | @Override 23 | public boolean isRoot() { 24 | return true; 25 | } 26 | 27 | /** 28 | * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例 29 | * 30 | * @return false 31 | */ 32 | @Override 33 | public boolean isTerminal() { 34 | return false; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/TerminalBookClass.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | /** 7 | * 终端分类。即分类的最末层。 8 | *

9 | * 采用的是中图法分类,例如"哲学宗教-哲学理论-辩证唯物主义-总论"的最后一个"总论"就是一个终端分类。 10 | * 只有终端分类下可以存储图书。 11 | * 12 | * @author padeoe 13 | * @Date: 2016/12/20 14 | */ 15 | public class TerminalBookClass extends BookClass { 16 | private Set books = new HashSet<>(); 17 | 18 | /** 19 | * 创建一个新初始化的{@code BookClass}对象, 20 | * 使之中图法分类标识是{@code id} 21 | * 22 | * @param id 分类的中图法分类标识。 23 | * 需要和南京大学馆藏数字化图书平台定义的格式一致 24 | */ 25 | public TerminalBookClass(String id) { 26 | super(id); 27 | } 28 | 29 | 30 | /** 31 | * 构造函数。 32 | * 33 | * @param id 分类编号 34 | * @param name 分类名 35 | * @param parent 父分类 36 | */ 37 | public TerminalBookClass(String id, String name, BookClass parent) { 38 | super(id, name, parent); 39 | } 40 | 41 | /** 42 | * 获取分类下的书籍 43 | * 该方法只是返回该分类下现有书籍,不会向服务器查询该分类下所有图书。 44 | * 如需向服务器查询,请调用{@link BookClass#queryAllBooks()}及其重载方法 45 | * 46 | * @return 分类下的书籍。 47 | */ 48 | public Set getBooks() { 49 | return books; 50 | } 51 | 52 | /** 53 | * 用于判断{@link BookClass}对象是不是{@link TerminalBookClass}的实例 54 | * 55 | * @return true 56 | */ 57 | @Override 58 | public boolean isTerminal() { 59 | return true; 60 | } 61 | 62 | 63 | /** 64 | * 用于判断{@link BookClass}对象是不是{@link RootBookClass}的实例 65 | * 66 | * @return false 67 | */ 68 | @Override 69 | public boolean isRoot() { 70 | return false; 71 | } 72 | 73 | /** 74 | * 增加分类下图书 75 | * 76 | * @param book 图书 77 | * @return 如果分类下已有该图书,将返回false。如果没有,将添加并返回true 78 | */ 79 | public boolean addBook(Book book) { 80 | return books.add(book); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/exception/BookDLException.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object.exception; 2 | 3 | import com.sslibrary.object.Book; 4 | 5 | /** 6 | * 下载某一本书时发生错误。此异常发生在该书对应的文件夹创建之前。因此此书没有任何文件被下载。 7 | * 8 | * @author padeoe 9 | * Date: 2016/12/12 10 | */ 11 | public class BookDLException extends Exception { 12 | /** 13 | * 
发生下载错误的书籍 14 | */ 15 | private Book book; 16 | 17 | /** 18 | * 创意一个初始化的{@code BookDLException},并指定发生错误的书籍。 19 | * 20 | * @param book 发生下载错误的书籍 21 | */ 22 | public BookDLException(Book book) { 23 | this.book = book; 24 | } 25 | 26 | /** 27 | * 获取发生下载错误的书籍 28 | * 29 | * @return 发生下载错误的书籍 30 | */ 31 | public Book getBook() { 32 | return book; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/exception/BookPagesDLException.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object.exception; 2 | 3 | import java.util.Vector; 4 | 5 | /** 6 | * 下载某一本书时发生错误。 7 | *

8 | * 此异常发生在书本对应文件夹已经创建之后。 9 | * 包含了此书所有的书页下载错误{@code PageDLException},用于错误恢复 10 | * 11 | * @author padeoe 12 | * Date: 2016/12/10 13 | */ 14 | public class BookPagesDLException extends Exception { 15 | Vector pageDLExceptions; 16 | 17 | /** 18 | * 构造一个{@code BookPagesDLException},用此书所有的书页下载错误初始化 19 | * 20 | * @param pageDLExceptionList 此书所有的书页下载错误 21 | */ 22 | public BookPagesDLException(Vector pageDLExceptionList) { 23 | this.pageDLExceptions = pageDLExceptionList; 24 | } 25 | 26 | /** 27 | * 获取页错误的集合 28 | * 29 | * @return 此书所有的书页下载错误{@code PageDLException} 30 | */ 31 | public Vector getPageDLExceptions() { 32 | return pageDLExceptions; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/object/exception/PageDLException.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.object.exception; 2 | 3 | /** 4 | * 下载图书的某一页时失败。 5 | *

/**
 * Signals that a single page of a book could not be downloaded.
 * <p>
 * It records where the page was fetched from and where it should have been
 * stored, so the download can be retried later.
 *
 * @author padeoe
 */
public class PageDLException extends Exception {
    private final String url;
    private final String location;

    /**
     * Creates the exception for one failed page.
     *
     * @param url      network address of the page image
     * @param location local path the image should have been stored at,
     *                 without the image file extension
     */
    public PageDLException(String url, String location) {
        // Supply a message so getMessage() and stack traces are informative.
        super("Failed to download page " + url + " to " + location);
        this.url = url;
        this.location = location;
    }

    /**
     * @return network address of the failed page
     */
    public String getUrl() {
        return url;
    }

    /**
     * @return local path the image should have been stored at, without extension
     */
    public String getLocation() {
        return location;
    }

    @Override
    public String toString() {
        return "PageDLException{" +
                "url='" + url + '\'' +
                ", location='" + location + '\'' +
                '}';
    }
}

20 | * 从南京大学馆藏数字化图书平台查询符合条件的书籍。 21 | * 可通过书名或者sql语句查询书籍。 22 | * 可以在查询过程中动态创建图书的分类目录结构。 23 | * 24 | * @author padeoe 25 | * @Date: 2016/12/09 26 | */ 27 | public class BookSearch { 28 | String cookie; 29 | 30 | /** 31 | * 查询 32 | * 33 | * @throws IOException 查询失败 34 | */ 35 | public BookSearch() throws IOException { 36 | this.cookie = NJULib.getSession(); 37 | } 38 | 39 | /** 40 | * 通过指定sql查询的where子句进行图书查询 41 | * 42 | * @param sqlWhereClause 一些已知字段包括"书名","主题词","出版日期","作者" 43 | * @param page 查询结果列表的页码 44 | * @param rootBookClass 查询到的书本将会添加进该分类结构 45 | * @return 查询结果,包含查询到的书本列表,书本总数量和结果总页数 46 | * @throws IOException 查询失败 47 | */ 48 | public Books searchBySQL(String sqlWhereClause, int page, RootBookClass rootBookClass) throws IOException { 49 | String url = NJULib.baseUrl + "/markbook/BookSearch.jsp"; 50 | String data = "Page=" + page + "&MethodType=1" + "&Library=&KeyName=0&Condition=" + URLEncoder.encode(sqlWhereClause, "UTF-8") + "&Sort=&links=0&PSize=10&_="; 51 | Map requestProperty = new HashMap<>(); 52 | requestProperty.put("Content-type", "application/x-www-form-urlencoded; charset=UTF-8"); 53 | String result = MyHttpRequest.postWithCookie(data, url, requestProperty, cookie, "UTF-8", "GBK", 2000); 54 | int totalNums = 0, totalPage = 0; 55 | Document doc = Jsoup.parse(result); 56 | Elements totalNumsNode = doc.select("input[name=TotalNums]"); 57 | if (totalNumsNode != null && totalNumsNode.size() > 0) { 58 | totalNums = Integer.parseInt(totalNumsNode.get(0).attr("value")); 59 | } 60 | Elements totalPageNode = doc.select("a[href]:contains(末页)"); 61 | if (totalPageNode != null && totalPageNode.size() > 0) { 62 | String href = totalPageNode.get(0).attr("href"); 63 | int start = href.indexOf('(') + 1; 64 | int end = href.indexOf(')'); 65 | if (start != 0 && end != -1) { 66 | totalPage = Integer.parseInt(href.substring(start, end)); 67 | } 68 | } 69 | Set books = rootBookClass.queryBooks(result); 70 | return new Books(page, totalPage, totalNums, books); 71 | } 72 | 73 
| /** 74 | * 通过指定sql查询的where子句进行图书查询 75 | * 76 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 77 | * @param page 查询结果列表的页码 78 | * @return 如果没有匹配结果,返回空的对象 79 | * @throws IOException 查询失败 80 | */ 81 | public Books searchBySQL(String sqlWhereClause, int page) throws IOException { 82 | return searchBySQL(sqlWhereClause, page, new RootBookClass()); 83 | } 84 | 85 | /** 86 | * 通过指定sql查询的where子句进行图书查询,只返回第一页结果。 87 | * 88 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 89 | * @return 如果没有匹配结果,返回空的对象 90 | * @throws IOException 查询失败 91 | */ 92 | public Books searchBySQL(String sqlWhereClause) throws IOException { 93 | return searchBySQL(sqlWhereClause, 1); 94 | } 95 | 96 | /** 97 | * 通过指定sql查询的where子句进行图书查询 98 | * 99 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 100 | * @return 查询结果,书的集合 101 | * @throws IOException 查询失败 102 | */ 103 | public Set findAllBySQL(String sqlWhereClause) throws IOException { 104 | Set bookSet = null; 105 | Books firstPageBooks = searchBySQL(sqlWhereClause, 1); 106 | bookSet = firstPageBooks.getBookSet(); 107 | for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) { 108 | bookSet.addAll(searchBySQL(sqlWhereClause, i).getBookSet()); 109 | } 110 | return bookSet; 111 | } 112 | 113 | /** 114 | * 通过指定sql查询的where子句进行图书查询,并把查询结果中的图书添加进分类结构 115 | * 116 | * @param sqlWhereClause where子句,一些已知字段包括"书名","主题词","出版日期","作者" 117 | * @param rootBookClass 根分类 118 | * @return 查询结果,书本集合 119 | * @throws IOException 查询失败 120 | */ 121 | public Set findAllBySQL(String sqlWhereClause, RootBookClass rootBookClass) throws IOException { 122 | 123 | Books firstPageBooks = searchBySQL(sqlWhereClause, 1, rootBookClass); 124 | Set bookSet = firstPageBooks.getBookSet(); 125 | for (int i = 2; i <= firstPageBooks.getTotalPage(); i++) { 126 | bookSet.addAll(searchBySQL(sqlWhereClause, i, rootBookClass).getBookSet()); 127 | } 128 | return bookSet; 129 | } 130 | 131 | private Books searchByName(String name) throws IOException { 
132 | return searchBySQL("书名 like '%" + name + "%' "); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/spider/NJULib.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.spider; 2 | 3 | import utils.network.MyHttpRequest; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * 用于获取Session 9 | * 10 | * @author padeoe 11 | * Date: 2016/12/08 12 | */ 13 | public class NJULib { 14 | public static final String baseUrl = "http://114.212.7.104:8181"; 15 | 16 | /** 17 | * 获取SeesionId 18 | * 19 | * @return SeesionId 20 | * @throws IOException 出现网络错误 21 | */ 22 | public static String getSession() throws IOException { 23 | System.out.println("正在重置cookie"); 24 | String Url = baseUrl + "/markbook/"; 25 | return MyHttpRequest.getAndGetCookie(Url, null, "UTF-8", 1000)[1]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/sslibrary/spider/PDFGenerator.java: -------------------------------------------------------------------------------- 1 | package com.sslibrary.spider; 2 | 3 | import cn.chineseall.Node; 4 | import com.itextpdf.kernel.pdf.*; 5 | import com.itextpdf.kernel.pdf.action.PdfAction; 6 | import com.itextpdf.kernel.pdf.navigation.PdfExplicitDestination; 7 | import com.sslibrary.object.Book; 8 | 9 | import java.io.*; 10 | import java.util.List; 11 | 12 | /** 13 | * Created by padeoe on 2017/4/24. 
14 | */ 15 | public class PDFGenerator { 16 | static String converComamndLocation="D:\\ImageMagick-7.0.5-Q16\\convert.exe"; 17 | File sourceDir; 18 | File outputDir; 19 | Book book; 20 | 21 | public PDFGenerator(File sourceDir, File outputDir, Book book) { 22 | this.sourceDir = sourceDir; 23 | this.outputDir = outputDir; 24 | this.book = book; 25 | } 26 | 27 | public void make(){ 28 | String outputPath=outputDir.toPath().resolve(sourceDir.getName()).toString(); 29 | String[] commands = new String[]{converComamndLocation, "-density","300","-units","PixelsPerInch",sourceDir.getPath()+System.getProperty("file.separator")+"*p*", outputPath+"-tmp.pdf"}; 30 | Runtime runtime = Runtime.getRuntime(); 31 | Process process; 32 | System.out.println(book.getName()+"开始合成pdf"); 33 | try { 34 | process = runtime.exec(commands); 35 | InputStream is = process.getErrorStream(); 36 | InputStreamReader isr = new InputStreamReader(is); 37 | BufferedReader bf = new BufferedReader(isr); 38 | String line; 39 | while ((line = bf.readLine()) != null) { 40 | System.out.println(line); 41 | } 42 | process.waitFor(); 43 | addBookMark(book,outputPath+"-tmp.pdf",outputPath+".pdf"); 44 | } catch (IOException e) { 45 | e.printStackTrace(); 46 | } catch (InterruptedException e) { 47 | e.printStackTrace(); 48 | } 49 | addBookMark(book,outputPath+"-tmp.pdf",outputPath+".pdf"); 50 | 51 | } 52 | 53 | public static void addBookMark(Book book,String src,String dest){ 54 | PdfDocument pdfDoc = null; 55 | try { 56 | pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest)); 57 | PdfOutline root = pdfDoc.getOutlines(false); 58 | PdfDocumentInfo info=pdfDoc.getDocumentInfo(); 59 | info.setTitle(book.getName()); 60 | info.setAuthor(book.getAuthor()); 61 | List nodes = book.getOutline(); 62 | addOutline(nodes, root, pdfDoc); 63 | pdfDoc.close(); 64 | } catch (IOException e) { 65 | e.printStackTrace(); 66 | } 67 | } 68 | 69 | 70 | private static void addOutline(List nodes, PdfOutline root, PdfDocument 
pdfDocument) { 71 | for (Node node : nodes) { 72 | PdfOutline child = root.addOutline(node.getTitle()); 73 | child.addAction(PdfAction.createGoTo( 74 | PdfExplicitDestination.createFitH(pdfDocument.getPage(node.getPage()), 75 | pdfDocument.getPage(node.getPage()).getPageSize().getTop()))); 76 | addOutline(node.getChildren(), child, pdfDocument); 77 | 78 | } 79 | } 80 | 81 | 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/utils/ImageMeger.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import com.njulib.fix.ListBook; 4 | import com.njulib.object.InfoReader; 5 | import com.njulib.spider.BookDownloader; 6 | import utils.conversion.PDFTool; 7 | 8 | import java.io.File; 9 | import java.io.FileNotFoundException; 10 | import java.net.MalformedURLException; 11 | import java.nio.file.Path; 12 | import java.nio.file.Paths; 13 | import java.util.Arrays; 14 | import java.util.List; 15 | import java.util.stream.Collectors; 16 | 17 | /** 18 | * Created by padeoe on 2017/4/27. 
19 | */ 20 | public class ImageMeger { 21 | private String rootDir; 22 | private String outputDir; 23 | 24 | public static void main(String[] args) { 25 | // args=new String[]{"G:\\Book","G:\\BookPDF"}; 26 | ImageMeger imageMeger = new ImageMeger(args[0], args[1]); 27 | imageMeger.start(); 28 | } 29 | 30 | public void start() { 31 | final int[] i = {1}; 32 | ListBook.getAllBooksAndDir(new File(rootDir)).filter( 33 | bookAndDir-> !(Paths.get(outputDir,bookAndDir.getDir().getName()+".pdf").toFile().exists()) 34 | ).forEach( 35 | bookAndDir -> { 36 | try { 37 | PDFTool.generatePDFFromImage( 38 | Arrays.stream( 39 | bookAndDir.getDir().listFiles()).filter( 40 | file -> file.getName().endsWith(".png") || file.getName().endsWith("jpg") 41 | ).toArray(File[]::new), 42 | Paths.get(outputDir, bookAndDir.getDir().getName() + ".pdf").toFile(), 43 | bookAndDir.getBook() 44 | ); 45 | String bookName=bookAndDir.getBook().getName(); 46 | String output="\r"+i[0] + " "+bookName; 47 | StringBuffer spaces=new StringBuffer(); 48 | for(int k=0;k<80-output.length();k++){ 49 | spaces.append(" "); 50 | } 51 | System.out.print(output+spaces.toString()); 52 | i[0]++; 53 | 54 | BookDownloader.deleteDir(bookAndDir.getDir()); 55 | 56 | } catch (Exception e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | ); 61 | } 62 | 63 | public ImageMeger(String rootDir, String outputDir) { 64 | this.rootDir = rootDir; 65 | this.outputDir = outputDir; 66 | } 67 | 68 | public String getRootDir() { 69 | return rootDir; 70 | } 71 | 72 | public void setRootDir(String rootDir) { 73 | this.rootDir = rootDir; 74 | } 75 | 76 | public String getOutputDir() { 77 | return outputDir; 78 | } 79 | 80 | public void setOutputDir(String outputDir) { 81 | this.outputDir = outputDir; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/utils/conversion/MyDecoder.java: -------------------------------------------------------------------------------- 1 | package 
/**
 * Decoder for strings in the format produced by JavaScript's {@code escape()}.
 *
 * @author padeoe
 * @Date 2016/12/21
 */
public class MyDecoder {
    /** {@code %uXXXX} (four hex digits) or {@code %XX} (two hex digits). */
    private static final Pattern ESCAPE = Pattern.compile("%u([0-9A-Fa-f]{4})|%([0-9A-Fa-f]{2})");

    /**
     * Decodes {@code %uXXXX} and {@code %XX} escapes into the characters they stand for.
     * Text that is not part of an escape, including a stray {@code %}, is copied
     * unchanged, so mixed input such as {@code "Java%u7f16%u7a0b"} keeps its literal part.
     *
     * @param input text such as {@code "%u7ecf%u5178%u7406%u8bba"}; must not be null
     * @return the decoded string
     */
    public static String decodeUrlUnicode(String input) {
        Matcher matcher = ESCAPE.matcher(input);
        StringBuilder decoded = new StringBuilder(input.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            // keep the literal text between the previous escape and this one
            decoded.append(input, copiedUpTo, matcher.start());
            String hex = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
            decoded.append((char) Integer.parseInt(hex, 16));
            copiedUpTo = matcher.end();
        }
        decoded.append(input, copiedUpTo, input.length());
        return decoded.toString();
    }
}
PdfWriter(outputPDF.getPath() 27 | ).setSmartMode(true)); 28 | 29 | int size = inputPdfDoc.getNumberOfPages(); 30 | inputPdfDoc.copyPagesTo(1, size, outPdfDoc); 31 | outPdfDoc.close(); 32 | inputPdfDoc.close(); 33 | } 34 | 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/utils/conversion/PDFTool.java: -------------------------------------------------------------------------------- 1 | package utils.conversion; 2 | 3 | import com.itextpdf.io.image.ImageDataFactory; 4 | import com.itextpdf.kernel.events.Event; 5 | import com.itextpdf.kernel.events.IEventHandler; 6 | import com.itextpdf.kernel.events.PdfDocumentEvent; 7 | import com.itextpdf.kernel.geom.PageSize; 8 | import com.itextpdf.kernel.geom.Rectangle; 9 | import com.itextpdf.kernel.pdf.PdfDocument; 10 | import com.itextpdf.kernel.pdf.PdfDocumentInfo; 11 | import com.itextpdf.kernel.pdf.PdfPage; 12 | import com.itextpdf.kernel.pdf.PdfWriter; 13 | import com.itextpdf.kernel.pdf.canvas.PdfCanvas; 14 | import com.itextpdf.layout.Canvas; 15 | import com.itextpdf.layout.element.Image; 16 | import com.njulib.object.Book; 17 | 18 | import java.io.*; 19 | import java.net.MalformedURLException; 20 | import java.util.*; 21 | import java.util.stream.Collectors; 22 | 23 | /** 24 | * 用于处理前期图片产物,压制成pdf,并给pdf添加书本信息 25 | * Created by padeoe on 2017/4/26. 
26 | */ 27 | public class PDFTool { 28 | 29 | /** 30 | * 将图片合成为一个PDF 31 | * @param inputImage 图片,格式为图片格式 32 | * @param outputPDF 输出文件 33 | * @throws FileNotFoundException 34 | * @throws MalformedURLException 35 | */ 36 | public static void generatePDFFromImage(File[] inputImage,File outputPDF) throws FileNotFoundException, MalformedURLException { 37 | Listimages=new LinkedList<>(); 38 | for(File file:inputImage){ 39 | images.add(new Image(ImageDataFactory.create(file.getPath()))); 40 | } 41 | PdfDocument pdfDoc = new PdfDocument(new PdfWriter(outputPDF.getPath())); 42 | 43 | images.forEach(image -> pdfDoc.addNewPage(new PageSize(new Rectangle(image.getImageScaledWidth(), image.getImageScaledHeight())))); 44 | BackgroundEventHandler handler = new BackgroundEventHandler(images); 45 | pdfDoc.addEventHandler(PdfDocumentEvent.END_PAGE, handler); 46 | pdfDoc.close(); 47 | } 48 | 49 | public static void generatePDFFromImage(File[] inputImage,File outputPDF,Book book) throws FileNotFoundException{ 50 | List sorted = Arrays.asList(inputImage); 51 | Collections.sort(sorted, Comparator.comparing(File::getName)); 52 | inputImage=sorted.toArray(new File[]{}); 53 | Listimages=new LinkedList<>(); 54 | boolean hasException=false; 55 | for(File file:inputImage){ 56 | try { 57 | images.add(new Image(ImageDataFactory.create(file.getPath()))); 58 | } catch (MalformedURLException e) { 59 | System.err.println(file.getPath()); 60 | e.printStackTrace(); 61 | } catch (com.itextpdf.io.IOException eee){ 62 | hasException=true; 63 | } 64 | } 65 | PdfDocument pdfDoc = new PdfDocument(new PdfWriter(outputPDF.getPath())); 66 | PdfDocumentInfo info=pdfDoc.getDocumentInfo(); 67 | if(book.getName()!=null&&!book.getName().equals("null")){ 68 | info.setTitle(book.getName()); 69 | } 70 | if(book.getAuthor()!=null&&!book.getAuthor().equals("null")){ 71 | info.setAuthor(book.getAuthor()); 72 | } 73 | if(book.getTheme()!=null&&!book.getTheme().equals("null")){ 74 | info.setSubject(book.getTheme()); 75 | 
} 76 | StringBuffer keyword=new StringBuffer(); 77 | if(book.getPublishDate()!=null&&!book.getPublishDate().equals("null")){ 78 | keyword.append("出版时间:"+book.getPublishDate()+"\n"); 79 | } 80 | if(book.getBookClass()!=null&&!book.getBookClass().equals("null")){ 81 | keyword.append("分类:"+book.getDetailBookClass().replaceAll("图书馆","")); 82 | } 83 | info.setKeywords(keyword.toString()); 84 | if(hasException){ 85 | System.err.println(book.getName()+" 图片格式异常"); 86 | info.setCreator("exception"); 87 | } 88 | 89 | images.forEach(image -> pdfDoc.addNewPage(new PageSize(new Rectangle(image.getImageScaledWidth(), image.getImageScaledHeight())))); 90 | BackgroundEventHandler handler = new BackgroundEventHandler(images); 91 | pdfDoc.addEventHandler(PdfDocumentEvent.END_PAGE, handler); 92 | pdfDoc.close(); 93 | } 94 | 95 | private static class BackgroundEventHandler implements IEventHandler { 96 | protected List images; 97 | protected int offset=0; 98 | 99 | public BackgroundEventHandler(List images) { 100 | this.images = images; 101 | } 102 | @Override 103 | public void handleEvent(Event event) { 104 | PdfDocumentEvent docEvent = (PdfDocumentEvent) event; 105 | PdfDocument pdfDoc = docEvent.getDocument(); 106 | PdfPage page = docEvent.getPage(); 107 | PdfCanvas canvas = new PdfCanvas(page.newContentStreamBefore(), 108 | page.getResources(), pdfDoc); 109 | Rectangle area = page.getPageSize(); 110 | new Canvas(canvas, pdfDoc, area) 111 | .add(images.get(offset)); 112 | offset++; 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/utils/network/MyByteArray.java: -------------------------------------------------------------------------------- 1 | package utils.network; 2 | 3 | /** 4 | * @author Nifury 5 | * Date: 2015/12/17 6 | */ 7 | public class MyByteArray { 8 | private byte[] buffer = new byte[4096]; 9 | private int position = 0; 10 | 11 | public void ensureCapacity(int capacity) { 12 | if (buffer.length 
- position < capacity) { 13 | byte[] tmp = new byte[Math.max(buffer.length * 2, buffer.length + capacity)]; 14 | System.arraycopy(buffer, 0, tmp, 0, position); 15 | buffer = tmp; 16 | } 17 | } 18 | 19 | public void addOffset(int delta) { 20 | position += delta; 21 | } 22 | 23 | public byte[] getBuffer() { 24 | return buffer; 25 | } 26 | 27 | public int getOffset() { 28 | return position; 29 | } 30 | 31 | public int getSize() { 32 | return position; 33 | } 34 | 35 | public static void main(String[] args) { 36 | MyByteArray array = new MyByteArray(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/utils/network/MyHttpRequest.java: -------------------------------------------------------------------------------- 1 | package utils.network; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.OutputStream; 6 | import java.io.UnsupportedEncodingException; 7 | import java.net.*; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * 该类用于负责http网络请求,包含get,set等方法 15 | * 16 | * @author padeoe, Nifury 17 | * Date: 2016/12/09 18 | */ 19 | public class MyHttpRequest { 20 | public static String[] action(String action, String data, String URL, Map requestProperty, String cookie, String inputEncoding, String outputEncoding, int timeout) throws IOException { 21 | ReturnData returnData = action_returnbyte(action, data, URL, requestProperty, cookie, inputEncoding, timeout); 22 | String result = null; 23 | if (returnData.data != null) { 24 | result = new String(returnData.data, 0, returnData.data.length, outputEncoding); 25 | } 26 | List cookies = returnData.getHeaders().get("Set-Cookie"); 27 | if (cookies != null && cookies.get(0) != null) { 28 | return new String[]{result, cookies.get(0)}; 29 | } 30 | return new String[]{result}; 31 | } 32 | 33 | /** 34 | * POST请求 35 | * 36 | * @param action post或get请求 37 | 
* @param data 数据 38 | * @param URL 服务器地址 39 | * @param requestProperty 请求头 40 | * @param cookie cookie若无则置为空 41 | * @param inputEncoding 请求编码 42 | * @param timeout 超时时间 43 | * @return 字符串数组,第一个元素是响应数据,若长度为2则第二个是返回的cookie 44 | * @throws IOException 网络错误 45 | */ 46 | public static ReturnData action_returnbyte(String action, String data, String URL, Map requestProperty, String cookie, String inputEncoding, int timeout) throws IOException { 47 | byte[] dataAsBytes = new byte[]{}; 48 | if (data != null) { 49 | dataAsBytes = data.getBytes(inputEncoding); 50 | } 51 | java.net.URL url = new URL(URL); 52 | HttpURLConnection connection = (HttpURLConnection) url 53 | .openConnection(/*new Proxy(Proxy.Type.SOCKS, new InetSocketAddress("127.0.0.1", 1080))*/); 54 | connection.setConnectTimeout(timeout); 55 | connection.setRequestMethod(action); 56 | if (action.toLowerCase().equals("post")) { 57 | connection.setDoOutput(true); 58 | } 59 | // connection.setUseCaches(false); 60 | /* java 1.6 does not support 61 | requestProperty.forEach((k,v) -> connection.setRequestProperty(k, v)); 62 | */ 63 | if (requestProperty != null) { 64 | for (Map.Entry entry : requestProperty.entrySet()) { 65 | connection.setRequestProperty(entry.getKey(), entry.getValue()); 66 | } 67 | } 68 | if (data != null) { 69 | connection.setRequestProperty("Content-Length", String.valueOf(dataAsBytes.length)); 70 | } 71 | 72 | if (cookie != null) { 73 | connection.setRequestProperty("Cookie", cookie); 74 | } 75 | connection.connect(); 76 | 77 | /* java 1.6 do not support 78 | try (OutputStream outputStream = connection.getOutputStream()) { 79 | outputStream.write(dataAsBytes); 80 | }*/ 81 | if (data != null) { 82 | OutputStream outputStream = null; 83 | try { 84 | outputStream = connection.getOutputStream(); 85 | outputStream.write(dataAsBytes); 86 | } finally { 87 | if (outputStream != null) { 88 | outputStream.close(); 89 | } 90 | 91 | } 92 | } 93 | 94 | //读取返回数据 95 | utils.network.MyByteArray myByteArray = new 
utils.network.MyByteArray(); 96 | /* java 1.6 do not support 97 | try (InputStream inputStream = connection.getInputStream()) { 98 | len = inputStream.read(readData); 99 | }*/ 100 | 101 | 102 | InputStream inputStream = null; 103 | Map> headers; 104 | /* if(connection.getURL().toString().indexOf(";")!=-1){ 105 | headers=new HashMap<>(); 106 | headers.put("Set-Cookie", Arrays.asList(connection.getURL().toString().split(";")[1])); 107 | connection.disconnect(); 108 | return new ReturnData(null, headers); 109 | }*/ 110 | 111 | try { 112 | inputStream = connection.getInputStream(); 113 | while (true) { 114 | myByteArray.ensureCapacity(4096); 115 | int len = inputStream.read(myByteArray.getBuffer(), myByteArray.getOffset(), 4096); 116 | if (len == -1) { 117 | break; 118 | } 119 | myByteArray.addOffset(len); 120 | } 121 | 122 | } finally { 123 | if (inputStream != null) { 124 | { 125 | inputStream.close(); 126 | } 127 | } 128 | } 129 | headers = connection.getHeaderFields(); 130 | 131 | connection.disconnect(); 132 | byte[] bytes = new byte[myByteArray.getSize()]; 133 | System.arraycopy(myByteArray.getBuffer(),0,bytes,0,bytes.length); 134 | return new ReturnData(bytes, headers); 135 | } 136 | 137 | /** 138 | * 获得cookie的POST请求 139 | * 140 | * @param postData 请求数据 141 | * @param URL 服务器地址 142 | * @param requestProperty 请求头 143 | * @param inputEncoding 请求编码 144 | * @param outputEncoding 响应编码 145 | * @param timeout 超时时间 146 | * @return 字符串数组,第一个元素是响应数据,第二个是返回的cookie 147 | * @throws IOException 网络错误 148 | */ 149 | public static String[] postAndGetCookie(String postData, String URL, Map requestProperty, String inputEncoding, String outputEncoding, int timeout) throws IOException { 150 | return action("POST", postData, URL, requestProperty, null, inputEncoding, outputEncoding, timeout); 151 | } 152 | 153 | /** 154 | * 发送cookie的POST请求 155 | * 156 | * @param postData 请求数据 157 | * @param URL 服务器地址 158 | * @param requestProperty 请求头 159 | * @param cookie 发送的cookie 160 | * @param 
inputEncoding 请求编码 161 | * @param outputEncoding 响应编码 162 | * @param timeout 超时时间 163 | * @return 响应数据 164 | * @throws IOException 网络错误 165 | */ 166 | public static String postWithCookie(String postData, String URL, Map requestProperty, String cookie, String inputEncoding, String outputEncoding, int timeout) throws IOException { 167 | return action("POST", postData, URL, requestProperty, cookie, inputEncoding, outputEncoding, timeout)[0]; 168 | } 169 | 170 | /** 171 | * POST请求(不含cookie) 172 | * 173 | * @param postData 请求数据 174 | * @param URL 服务器地址 175 | * @param requestProperty 请求头 176 | * @param inputEncoding 请求编码 177 | * @param outputEncoding 响应编码 178 | * @param timeout 超时时间 179 | * @return 响应数据 180 | * @throws IOException 网络错误 181 | */ 182 | public static String post(String postData, String URL, Map requestProperty, String inputEncoding, String outputEncoding, int timeout) throws IOException { 183 | return action("POST", postData, URL, requestProperty, null, inputEncoding, outputEncoding, timeout)[0]; 184 | } 185 | 186 | 187 | /** 188 | * 获得cookie的Get请求 189 | * 190 | * @param URL 服务器地址 191 | * @param requestProperty 请求头 192 | * @param outputEncoding 响应编码 193 | * @param timeout 超时时间 194 | * @return 字符串数组,第一个元素是响应数据,第二个是返回的cookie 195 | * @throws IOException 网络错误 196 | */ 197 | public static String[] getAndGetCookie(String URL, Map requestProperty, String outputEncoding, int timeout) throws IOException { 198 | return action("GET", null, URL, requestProperty, null, "null", outputEncoding, timeout); 199 | } 200 | 201 | /** 202 | * 需要cookie的Get请求 203 | * 204 | * @param URL 服务器地址 205 | * @param requestProperty 请求头 206 | * @param cookie 发送的cookie 207 | * @param outputEncoding 响应编码 208 | * @param timeout 超时时间 209 | * @return 响应数据 210 | * @throws IOException 网络错误 211 | */ 212 | public static String getWithCookie(String URL, Map requestProperty, String cookie, String outputEncoding, int timeout) throws IOException { 213 | return action("GET", null, URL, requestProperty, 
cookie, null, outputEncoding, timeout)[0]; 214 | } 215 | 216 | /** 217 | * POST请求(不含cookie) 218 | * 219 | * @param URL 服务器地址 220 | * @param requestProperty 请求头 221 | * @param outputEncoding 响应编码 222 | * @param timeout 超时时间 223 | * @return 响应数据 224 | * @throws IOException 网络错误 225 | */ 226 | public static String get(String URL, Map requestProperty, String outputEncoding, int timeout) throws IOException { 227 | return action("GET", null, URL, requestProperty, null, null, outputEncoding, timeout)[0]; 228 | } 229 | 230 | public static int getReturnCode(String action, String postData, String URL, Map requestProperty, String inputEncoding, String outputEncoding, int timeout) { 231 | try { 232 | byte[] postAsBytes = new byte[]{}; 233 | if (postData != null) { 234 | postAsBytes = postData.getBytes(inputEncoding); 235 | } 236 | java.net.URL url = new URL(URL); 237 | HttpURLConnection connection = (HttpURLConnection) url 238 | .openConnection(); 239 | connection.setConnectTimeout(timeout); 240 | connection.setDoOutput(true); 241 | connection.setRequestMethod(action); 242 | connection.setUseCaches(false); 243 | /* java 1.6 does not support 244 | requestProperty.forEach((k,v) -> connection.setRequestProperty(k, v)); 245 | */ 246 | if (requestProperty != null) { 247 | for (Map.Entry entry : requestProperty.entrySet()) { 248 | connection.setRequestProperty(entry.getKey(), entry.getValue()); 249 | } 250 | } 251 | connection.setRequestProperty("Content-Length", String.valueOf(postAsBytes.length)); 252 | connection.connect(); 253 | int code = connection.getResponseCode(); 254 | connection.disconnect(); 255 | return code; 256 | } catch (UnsupportedEncodingException e) { 257 | System.out.println(e); 258 | return -1; 259 | } catch (MalformedURLException malformedURLException) { 260 | System.out.println(malformedURLException); 261 | return -2; 262 | } catch (ProtocolException protocolException) { 263 | System.out.println(protocolException); 264 | return -3; 265 | } catch (IOException 
ioException) { 266 | System.out.println(ioException); 267 | return -4; 268 | 269 | } 270 | } 271 | } 272 | -------------------------------------------------------------------------------- /src/main/java/utils/network/ReturnData.java: -------------------------------------------------------------------------------- 1 | package utils.network; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | /** 7 | * Created by padeoe on 2016/5/12. 8 | */ 9 | public class ReturnData { 10 | byte[] data; 11 | Map> headers; 12 | 13 | public ReturnData(byte[] data, Map> headers) { 14 | this.data = data; 15 | this.headers = headers; 16 | } 17 | 18 | public byte[] getData() { 19 | return data; 20 | } 21 | 22 | public Map> getHeaders() { 23 | return headers; 24 | } 25 | } --------------------------------------------------------------------------------