├── LICENSE
├── README.md
└── wereadScraper.user.js


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Ze-Zheng Wu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # weread-scraper
 2 | Export Weread books to html files
 3 | 
 4 | <img src="https://user-images.githubusercontent.com/10386119/186714588-97e1b755-ce62-4f89-a64d-268824f39e9e.png" width=480/>
 5 | 
 6 | ## Steps
 7 | 
 8 | - Add [this userscript](https://greasyfork.org/zh-CN/scripts/450169-weread-scraper) in [Tampermonkey](https://www.tampermonkey.net/)
 9 | - Select the book you want to save in Weread, e.g. https://weread.qq.com/web/bookDetail/f6432a905b73c0f64797a8d
10 | - Browse the first page of this book or any other page where you want to start to save, e.g. https://weread.qq.com/web/reader/f6432a905b73c0f64797a8dkc81322c012c81e728d9d180
11 | - Left-click the Tampermonkey icon, then click the "Start Scraping" button in the pop-up menu
12 | - Wait for scraping to complete and an HTML file will be automatically generated and downloaded
13 | - You can cancel an ongoing scraping process by clicking "Cancel Scraping"
14 | - You can stop an ongoing scraping process and save the available scraped contents by clicking "Stop Scraping & Save"
15 | - You can set the interval between next-page clicks by clicking "Set Click Interval" and entering the desired value (in milliseconds)
16 | - That's it!
17 | 
18 | ## Notes
19 | 
20 | - Chrome and MS Edge are recommended for running this script
21 | - Firefox is recommended for printing the downloaded HTML to PDF
22 | - Many of the books use the following fonts; install them for a better reading experience: [汉仪旗黑 50S](https://www.hanyi.com.cn/productdetail?id=831), [汉仪旗黑 65S](https://www.hanyi.com.cn/productdetail.php?id=834) and [汉仪楷体S](https://www.hanyi.com.cn/productdetail.php?id=814). Other fonts may be in use as well, but I haven't checked them all; if you know of any, feel free to open an issue. `PingFang SC` is a good choice for a fallback font
23 | - Scraping books with a very large number of pages can crash your browser, because the script does not use any streaming, chunking or garbage-collection mechanism. Further work is needed to fix this
24 | 
25 | Enjoy, and please don't use this script to pirate books or sell them!
26 | 


--------------------------------------------------------------------------------
/wereadScraper.user.js:
--------------------------------------------------------------------------------
  1 | // ==UserScript==
  2 | // @name         Weread Scraper
  3 | // @namespace    https://github.com/Sec-ant/weread-scraper
  4 | // @version      0.4
  5 | // @description  Export Weread books to html file
  6 | // @author       Secant
  7 | // @match        https://weread.qq.com/web/reader/*
  8 | // @icon         https://weread.qq.com/favicon.ico
  9 | // @grant        GM_registerMenuCommand
 10 | // @grant        GM_setValue
 11 | // @grant        GM_getValue
 12 | // @run-at       document-start
 13 | // ==/UserScript==
 14 | 
 15 | (async function () {
 16 |   "use strict";
 17 |   // interactive
 18 |   GM_registerMenuCommand("Start Scraping", startScraping);
 19 |   GM_registerMenuCommand("Cancel Scraping", cancelScraping);
 20 |   GM_registerMenuCommand("Stop Scraping & Save", stopScrapingAndSave);
 21 |   GM_registerMenuCommand("Set Click Interval", setClickInterval);
 22 | 
 23 |   // construct html root
 24 |   const rootElement = document.createElement("html");
 25 |   const styleElement = document.createElement("style");
 26 |   const bodyElement = document.createElement("body");
 27 |   rootElement.append(styleElement);
 28 |   rootElement.append(bodyElement);
 29 | 
 30 |   // initialize flags
 31 |   const scrapeFlag = JSON.parse(
 32 |     sessionStorage.getItem("scrapeFlag") || "false"
 33 |   );
 34 |   let contentFound = false;
 35 |   let timeoutIsSet = false;
 36 |   let abortTimeout = false;
 37 | 
 38 |   // define observer handlers
 39 |   const contentObserver = new MutationObserver(async (_, observer) => {
 40 |     const content = document.querySelector(".preRenderContainer:not([style])");
 41 |     if (!contentFound && content) {
 42 |       // define styles
 43 |       if (styleElement.childNodes.length === 0) {
 44 |         const contentStyle = content.querySelector("style");
 45 |         if (contentStyle?.childNodes.length) {
 46 |           styleElement.innerHTML = contentStyle.innerHTML
 47 |             .replaceAll(".readerChapterContent", ".preRenderContent")
 48 |             .replaceAll(/汉仪旗黑(?=\d)/g, "汉仪旗黑 ")
 49 |             .replaceAll(/汉仪楷体(?!S)/g, "汉仪楷体S");
 50 |           styleElement.append(
 51 |             ".preRenderContent { page-break-after: always; }"
 52 |           );
 53 |         }
 54 |       }
 55 |       // append contents
 56 |       const contentDiv = content.querySelector("#preRenderContent");
 57 |       if (contentDiv) {
 58 |         contentDiv.removeAttribute("id");
 59 |         contentDiv
 60 |           .querySelectorAll("img")
 61 |           .forEach(
 62 |             (img) => (img.src = img.getAttribute("data-src") || img.src)
 63 |           );
 64 |         bodyElement.append(contentDiv.cloneNode(true));
 65 |         contentFound = true;
 66 |       }
 67 |     }
 68 | 
 69 |     // turn to next page
 70 |     const nextPage = document.querySelector(".readerFooter_button");
 71 |     if (contentFound && nextPage && !timeoutIsSet) {
 72 |       contentFound = false;
 73 |       timeoutIsSet = true;
 74 |       // sleep for {click interval} ms
 75 |       await new Promise((resolve) => {
 76 |         setTimeout(() => {
 77 |           resolve();
 78 |         }, getClickInterval());
 79 |       });
 80 |       timeoutIsSet = false;
 81 |       if (abortTimeout) {
 82 |         abortTimeout = false;
 83 |       } else {
 84 |         nextPage.dispatchEvent(
 85 |           new MouseEvent("click", {
 86 |             clientX: 1,
 87 |             clientY: 1,
 88 |           })
 89 |         );
 90 |       }
 91 |     }
 92 | 
 93 |     // complete
 94 |     const ending = document.querySelector(".readerFooter_ending");
 95 |     if (ending) {
 96 |       stopScrapingAndSave();
 97 |     }
 98 |   });
 99 | 
100 |   // start observation
101 |   if (scrapeFlag) {
102 |     contentObserver.observe(document.documentElement, {
103 |       childList: true,
104 |       subtree: true,
105 |     });
106 |   }
107 | 
108 |   // get click interval helper function
109 |   function getClickInterval() {
110 |     return GM_getValue("clickInterval", 0);
111 |   }
112 | 
113 |   // menu functions
114 |   function stopScrapingAndSave() {
115 |     sessionStorage.setItem("scrapeFlag", "false");
116 |     contentObserver.disconnect();
117 |     const docBlob = new Blob([rootElement.outerHTML], {
118 |       type: "text/html;charset=utf-8;",
119 |     });
120 |     const dummyLink = document.createElement("a");
121 |     dummyLink.href = URL.createObjectURL(docBlob);
122 |     const bookTitle = document
123 |       .querySelector(".readerCatalog_bookInfo_title_txt")
124 |       .textContent.trim();
125 |     dummyLink.download = `${bookTitle}.html`;
126 |     document.body.appendChild(dummyLink);
127 |     dummyLink.click();
128 |     document.body.removeChild(dummyLink);
129 |     URL.revokeObjectURL(dummyLink.href);
130 |     styleElement.innerHTML = "";
131 |     bodyElement.innerHTML = "";
132 |     contentFound = false;
133 |     timeoutIsSet = false;
134 |     abortTimeout = true;
135 |   }
136 | 
137 |   function startScraping() {
138 |     sessionStorage.setItem("scrapeFlag", "true");
139 |     window.location.reload();
140 |   }
141 | 
142 |   function cancelScraping() {
143 |     sessionStorage.setItem("scrapeFlag", "false");
144 |     window.location.reload();
145 |   }
146 | 
147 |   function setClickInterval() {
148 |     const prevClickInterval = getClickInterval();
149 |     let newClickInterval;
150 |     try {
151 |       newClickInterval = parseFloat(
152 |         window.prompt("Click interval (ms):", prevClickInterval)
153 |       );
154 |       if (!Number.isFinite(newClickInterval) || newClickInterval < 0) {
155 |         throw undefined;
156 |       }
157 |     } catch (e) {
158 |       newClickInterval = prevClickInterval;
159 |     }
160 |     GM_setValue("clickInterval", newClickInterval);
161 |   }
162 | })();
163 | 


--------------------------------------------------------------------------------