├── WeReadScan ├── __init__.py ├── script │ ├── __init__.py │ ├── os_util.py │ └── png2pdf.py └── WeRead.py ├── requirements.txt ├── example ├── sample.png └── demo.py ├── .gitignore └── README.md /WeReadScan/__init__.py: -------------------------------------------------------------------------------- 1 | from .WeRead import WeRead -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pillow 2 | numpy 3 | matplotlib 4 | opencv-python 5 | selenium -------------------------------------------------------------------------------- /example/sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Algebra-FUN/WeReadScan/HEAD/example/sample.png -------------------------------------------------------------------------------- /WeReadScan/script/__init__.py: -------------------------------------------------------------------------------- 1 | from .png2pdf import img2pdf, png2bmp 2 | from .os_util import dir_check, os_start_file, clear_temp, escape 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | test* 3 | setup*.py 4 | wrs-temp/ 5 | *.pdf 6 | dist/ 7 | build/ 8 | *egg-info/ 9 | temp*.py 10 | *.bat 11 | test 12 | .vscode -------------------------------------------------------------------------------- /WeReadScan/script/os_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | os_util.py 3 | Copyright 2020 by Algebra-FUN 4 | ALL RIGHTS RESERVED. 5 | ''' 6 | 7 | import os 8 | import shutil 9 | import re 10 | 11 | 12 | def dir_check(dir): 13 | try: 14 | os.makedirs(f'{os.getcwd()}/{dir}') 15 | except FileExistsError: 16 | pass 17 | 18 | def os_start_file(file_name): 19 | os.system(f'start {file_name}') 20 | 21 | def clear_temp(file_name): 22 | shutil.rmtree(file_name) 23 | 24 | def escape(file_name): 25 | return re.sub(r"[\/\\\:\*\?\"\<\>\|]",'_',file_name) -------------------------------------------------------------------------------- /WeReadScan/script/png2pdf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | png2pdf.py 3 | Copyright 2020 by Algebra-FUN 4 | ALL RIGHTS RESERVED. 5 | ''' 6 | 7 | 8 | import cv2 9 | import numpy as np 10 | from PIL import Image 11 | 12 | 13 | def png2bmp(file_path, binary_threshold=200): 14 | img = cv2.imdecode(np.fromfile(f'{file_path}.png', dtype=np.uint8), -1) 15 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 16 | retval, dst = cv2.threshold(gray, binary_threshold, 255, cv2.THRESH_BINARY) 17 | return Image.fromarray(dst) 18 | 19 | 20 | def img2pdf(file_name, jpg_name_list, binary_threshold=200, quality=100): 21 | image_list = [png2bmp(path, binary_threshold=binary_threshold) 22 | for path in jpg_name_list] 23 | image_list[0].save(f'{file_name}.pdf', save_all=True, 24 | append_images=image_list[1:], resolution=quality) 25 | -------------------------------------------------------------------------------- /example/demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | demo.py 3 | The demo of WeReadScan.py 4 | Copyright 2020 by Algebra-FUN 5 | ALL RIGHTS RESERVED. 6 | """ 7 | 8 | 9 | from selenium.webdriver import Chrome, ChromeOptions 10 | 11 | from WeReadScan import WeRead 12 | 13 | # options 14 | chrome_options = ChromeOptions() 15 | chrome_options.add_argument('--headless') #! important argument 16 | chrome_options.add_argument("--disable-blink-features=AutomationControlled") 17 | chrome_options.add_argument('disable-infobars') 18 | chrome_options.add_argument('log-level=3') 19 | 20 | # launch Webdriver 21 | print('Webdriver launching...') 22 | driver = Chrome(options=chrome_options) 23 | print('Webdriver launched.') 24 | 25 | with WeRead(driver,debug=True) as weread: 26 | weread.login() #? login for grab the whole book 27 | weread.scan2pdf('https://weread.qq.com/web/reader/60b32c107207bc8960bd9cekecc32f3013eccbc87e4b62e') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WeReadScan 2 | 3 | ![GitHub last commit](https://img.shields.io/github/last-commit/Algebra-FUN/WeReadScan) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/Algebra-FUN/WeReadScan) ![GitHub top language](https://img.shields.io/github/languages/top/Algebra-FUN/WeReadScan) [![pip](https://img.shields.io/badge/pip-0.8.7-orange)](https://pypi.org/project/WeReadScan/) 4 | 5 | ## About 6 | 7 | 一个用于的将`微信读书`上的图书扫描转换本地PDF/HTML的爬虫库. 8 | 9 | ### 谈谈为何而开发 10 | 11 | 不得不说,“微信读书”是一个很好的平台。但是美中不足很明显,用户购买了图书资源,但是只能在“微信读书”的Application中阅读或者做一些文字批注╮(╯▽╰)╭,这些功能相较于购买的纸质书籍显然是不足的。比如,作者就习惯于用iPad的相关notebook类app做笔记,而“微信读书”并没有适配pencil做handwriting笔记的功能。 12 | 13 | 因此,既然“微信读书”没有提供,那只好自己解决了。于是,经过2天的开发,终于有了这个爬虫脚本,也可以开心地做手写笔记了o(_ ̄▽ ̄_)ブ 14 | 15 | ### 相关版本 16 | 17 | 在[Sec-ant](https://github.com/Sec-ant)的建议下,参考了他的解决方案[weread-scraper](https://github.com/Sec-ant/weread-scraper),将其中最重要的获取#preRenderContent的部分脚本进行整合,得到了[WeReadScan-HTML](https://github.com/Algebra-FUN/WeReadScan/tree/html-variant)版本,可以直接自动化获得多本图书的HTML,更加高效。 18 | 19 | ## Get started 20 | 21 | ```shell 22 | pip install WeReadScan-HTML 23 | ``` 24 | 25 | > 本项目需要使用selenium,需要对selenium具备基础的了解 26 | 27 | ### Demo 28 | 29 | 话不多说,直接上代码 30 | 31 | ```python 32 | from selenium.webdriver import Edge 33 | from selenium.webdriver.chrome.service import Service 34 | from selenium.webdriver.edge.options import Options 35 | 36 | from WeReadScan import WeRead 37 | 38 | options = Options() 39 | options.add_argument("--disable-blink-features=AutomationControlled") 40 | options.add_argument('disable-infobars') 41 | options.add_argument('log-level=3') 42 | options.add_argument("headless") 43 | 44 | # launch Webdriver 45 | print('Webdriver launching...') 46 | driver = Edge(options=options) 47 | # driver = Edge(service=service, options=options) 48 | print('Webdriver launched.') 49 | 50 | with WeRead(driver,debug=True) as weread: 51 | weread.login() #? login for grab the whole book 52 | weread.scan2html('https://weread.qq.com/web/reader/2c632ef071a486a92c60226kc81322c012c81e728d9d180') 53 | weread.scan2html('https://weread.qq.com/web/reader/a9c32f40717db77aa9c9171kc81322c012c81e728d9d180') 54 | ``` 55 | 56 | 扫描结果样例: 57 | 58 | ![](https://github.com/Algebra-FUN/WeReadScan/blob/master/example/sample.png?raw=true) 59 | 60 | 几点说明: 61 | 62 | 1. webdriver 需要 `无头(headless)` 模式启动 63 | 2. 只有登陆后,才能扫描完整的图书资源;若不登陆,也可以扫描部分无需解锁的部分 64 | 65 | ## API Reference 66 | 67 | ### WeRead 68 | 69 | WeReadScan.WeRead(headless_driver) 70 | 71 | `微信读书`网页代理,用于图书扫描 72 | 73 | #### Args 74 | 75 | - headless_driver: 设置了headless的Webdriver示例 76 | 77 | #### Returns 78 | 79 | - WeReadInstance 80 | 81 | #### Usage 82 | 83 | ```python 84 | chrome_options = ChromeOptions() 85 | chrome_options.add_argument('--headless') 86 | headless_driver = Chrome(chrome_options=chrome_options) 87 | weread = WeRead(headless_driver) 88 | ``` 89 | 90 | ### Login 91 | 92 | WeReadScan.WeRead.login(wait_turns=15) 93 | 94 | 展示二维码以登陆微信读书 95 | 96 | #### Args 97 | 98 | - wait_turns: 登陆二维码等待扫描的等待轮数 99 | 100 | #### Usage 101 | 102 | ```python 103 | weread.login() 104 | ``` 105 | 106 | ### Scan2html 107 | 108 | WeReadScan.WeRead.scan2html(book_url, save_at='.', show_output=True) 109 | 110 | 扫面`微信读书`的书籍转换为PDF并保存本地 111 | 112 | #### Args 113 | 114 | | 参数名 | 类型 | 默认值 | 描述 | 115 | | ---------------- | ---- | ---- | --------------------- | 116 | | book_url | str | 必填 | 扫描目标书籍的URL | 117 | | save_at | str | '.' | 保存地址 | 118 | | show_output | bool | True | 是否在该方法函数结束时展示生成的PDF文件 | 119 | 120 | #### Usage 121 | 122 | ```python 123 | weread.scan2html('https://weread.qq.com/web/reader/a57325c05c8ed3a57224187kc81322c012c81e728d9d180') 124 | ``` 125 | 126 | ## Disclaimer 127 | 128 | - 本脚本仅限用于**已购**图书的爬取,用于私人学习目的,禁止用于商业目的和网上资源扩散,尊重微信读书方面的利益 129 | - 若User使用该脚本用于不当的目的,责任由使用者承担,作者概不负责 130 | 131 | ## Stargazers over time 132 | 133 | [![Stargazers over time](https://starchart.cc/Algebra-FUN/WeReadScan.svg)](https://starchart.cc/Algebra-FUN/WeReadScan) 134 | 135 | -------------------------------------------------------------------------------- /WeReadScan/WeRead.py: -------------------------------------------------------------------------------- 1 | ''' 2 | WeRead.py 3 | Copyright 2020 by Algebra-FUN 4 | ALL RIGHTS RESERVED. 5 | ''' 6 | 7 | from matplotlib import pyplot as plt 8 | from PIL import Image 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | from selenium.webdriver.remote.webdriver import WebDriver 12 | 13 | from .script import img2pdf, dir_check, os_start_file, clear_temp, escape 14 | 15 | from time import sleep 16 | 17 | 18 | class WeRead: 19 | """ 20 | The agency who control `WeRead` web page with selenium webdriver to processing book scanning. 21 | 22 | `微信读书`网页代理,用于图书扫描 23 | 24 | :Args: 25 | - headless_driver: 26 | Webdriver instance with headless option set. 27 | 设置了headless的Webdriver示例 28 | 29 | :Returns: 30 | - WeReadInstance 31 | 32 | :Usage: 33 | chrome_options = ChromeOptions() 34 | chrome_options.add_argument('--headless') 35 | 36 | headless_driver = Chrome(chrome_options=chrome_options) 37 | 38 | weread = WeRead(headless_driver) 39 | """ 40 | 41 | def __init__(self, headless_driver: WebDriver, patience=30, debug=False): 42 | headless_driver.get('https://weread.qq.com/') 43 | headless_driver.implicitly_wait(5) 44 | self.driver: WebDriver = headless_driver 45 | self.patience = patience 46 | self.debug_mode = debug 47 | 48 | def __enter__(self): 49 | return self 50 | 51 | def __exit__(self, *args): 52 | if not self.debug_mode: 53 | clear_temp('wrs-temp') 54 | 55 | def S(self, selector): 56 | return WebDriverWait(self.driver, self.patience).until(lambda driver: driver.find_element(By.CSS_SELECTOR, selector)) 57 | 58 | def click(self, target): 59 | self.driver.execute_script('arguments[0].click();', target) 60 | 61 | def shot_full_canvas_context(self, file_name): 62 | renderTargetContainer = self.S('.renderTargetContainer') 63 | height = renderTargetContainer.get_property('offsetHeight') 64 | height += renderTargetContainer.get_property('offsetTop') 65 | width = self.driver.execute_script("return window.outerWidth") 66 | self.driver.set_window_size(width, height) 67 | sleep(1) 68 | content = self.S('.app_content') 69 | content.screenshot(file_name) 70 | 71 | def check_all_image_loaded(self, frequency=10, max_wait_duration=30): 72 | """ 73 | check if all image is loaded. 74 | 75 | 检查图书Image是否全部加载完毕. 76 | """ 77 | interval = 1/frequency 78 | 79 | try: 80 | img_unloaded = WebDriverWait(self.driver, 3).until( 81 | lambda driver: driver.find_elements(By.CSS_SELECTOR, 'img.wr_absolute')) 82 | except Exception: 83 | return False 84 | 85 | for _ in range(frequency*max_wait_duration): 86 | sleep(interval) 87 | for img in img_unloaded: 88 | if img.get_property('complete'): 89 | img_unloaded.remove(img) 90 | if not len(img_unloaded): 91 | self.debug_mode and print('all image is loaded!') 92 | return True 93 | return False 94 | 95 | def login(self, wait_turns=15): 96 | """ 97 | show QRCode to login weread. 98 | 99 | 展示二维码以登陆微信读书 100 | 101 | :Args: 102 | - wait_turns: 103 | Loop turns wait for scanning QRCode 104 | 登陆二维码等待扫描的等待轮数 105 | 106 | :Usage: 107 | weread.login() 108 | """ 109 | 110 | dir_check('wrs-temp') 111 | 112 | # get QRCode for Login 113 | self.S('button.navBar_link_Login').click() 114 | self.S('.login_dialog_qrcode>img').screenshot( 115 | 'wrs-temp/login_qrcode.png') 116 | 117 | login_qrcode = Image.open('wrs-temp/login_qrcode.png') 118 | plt.ion() 119 | plt.title('Scan this QRCode to Login.') 120 | plt.imshow(login_qrcode) 121 | plt.show() 122 | plt.pause(.001) 123 | 124 | # wair for QRCode Scan 125 | for i in range(wait_turns): 126 | print(f'Wait for QRCode Scan...{i}/{wait_turns}turns') 127 | try: 128 | self.driver.find_element(By.CSS_SELECTOR, '.menu_container') 129 | print('Login Succeed.') 130 | break 131 | except Exception: 132 | plt.pause(1) 133 | else: 134 | raise Exception('WeRead.Timeout: Login timeout.') 135 | 136 | # close QRCode Window 137 | plt.ioff() 138 | plt.close() 139 | 140 | def switch_to_context(self): 141 | """switch to main body of the book""" 142 | self.S('button.catalog').click() 143 | self.S('li.chapterItem:nth-child(2)').click() 144 | 145 | def set_font_size(self, font_size_index=1): 146 | """ 147 | set font size 148 | 149 | 设置字体大小 150 | 151 | :Args: 152 | - font_size_index=0: 153 | the index of font size(1-7) 154 | 字体大小级别(1-7) 155 | In particular, 1 represents minimize, 7 represents maximize 156 | 特别地,1为最小,7为最大 157 | """ 158 | sleep(1) 159 | self.S('button.fontSizeButton').click() 160 | sleep(1) 161 | self.S(f'.vue-slider-mark:nth-child({font_size_index})').click() 162 | self.S('.app_content').click() 163 | 164 | def turn_light_on(self): 165 | sleep(1) 166 | self.S('button.readerControls_item.white').click() 167 | 168 | def scan2pdf(self, book_url, save_at='.', binary_threshold=200, quality=100, show_output=True, font_size_index=1): 169 | """ 170 | scan `weread` book to pdf and save offline. 171 | 172 | 扫面`微信读书`的书籍转换为PDF并保存本地 173 | 174 | :Args: 175 | - book_url: 176 | the url of weread book which aimed to scan 177 | 扫描目标书籍的ULR 178 | - save_at='.': 179 | the path of where to save 180 | 保存地址 181 | - binary_threshold=200: 182 | threshold of scan binary 183 | 二值化处理的阈值 184 | - quality=95: 185 | quality of scan pdf 186 | 扫描PDF的质量 187 | - show_output=True: 188 | if show the output pdf file at the end of this method 189 | 是否在该方法函数结束时展示生成的PDF文件 190 | - font_size_index=1: 191 | the index of font size(1-7) 192 | 字体大小级别(1-7) 193 | In particular, 1 represents minimize, 7 represents maximize 194 | 特别地,1为最小,7为最大 195 | 196 | :Usage: 197 | weread.scan2pdf('https://weread.qq.com/web/reader/a57325c05c8ed3a57224187kc81322c012c81e728d9d180') 198 | """ 199 | print('Task launching...') 200 | 201 | # valid the url 202 | if 'https://weread.qq.com/web/reader/' not in book_url: 203 | raise Exception('WeRead.UrlError: Wrong url format.') 204 | 205 | # switch to target book url 206 | self.driver.get(book_url) 207 | print(f'navigate to {book_url}') 208 | 209 | # turn theme to light theme 210 | self.turn_light_on() 211 | 212 | # set font size 213 | self.set_font_size(font_size_index) 214 | 215 | # switch to target book's cover 216 | self.switch_to_context() 217 | 218 | # get the name of the book 219 | book_name = escape(self.S('span.readerTopBar_title_link').text) 220 | print(f'preparing to scan "{book_name}"') 221 | 222 | # check the dir for future save 223 | dir_check(f'wrs-temp/{book_name}/context') 224 | 225 | # used to store png_name for pdf converting 226 | png_name_list = [] 227 | 228 | page = 1 229 | 230 | while True: 231 | sleep(1) 232 | 233 | # get chapter 234 | chapter = escape(self.S('span.readerTopBar_title_chapter').text) 235 | print(f'scanning chapter "{chapter}"') 236 | 237 | # locate the renderTargetContent 238 | context = self.S('div.app_content') 239 | 240 | # check all image loaded 241 | self.check_all_image_loaded() 242 | 243 | # context_scan2png 244 | png_name = f'wrs-temp/{book_name}/context/{chapter}_{page}' 245 | self.shot_full_canvas_context(f'{png_name}.png') 246 | 247 | png_name_list.append(png_name) 248 | print(f'save chapter scan {png_name}') 249 | 250 | # find next page or chapter button 251 | try: 252 | readerFooter = self.S( 253 | '.readerFooter_button,.readerFooter_ending') 254 | except Exception: 255 | break 256 | 257 | readerFooterClass = readerFooter.get_attribute('class') 258 | 259 | # quick ending 260 | if 'ending' in readerFooterClass: 261 | break 262 | 263 | next_btn_text = readerFooter.text.strip() 264 | 265 | if next_btn_text == "下一页": 266 | print("go to next page") 267 | page += 1 268 | elif next_btn_text == "下一章": 269 | print("go to next chapter") 270 | page = 1 271 | else: 272 | raise Exception("Unexpected Exception") 273 | 274 | # go to next page or chapter 275 | readerFooter.click() 276 | 277 | print('pdf converting...') 278 | 279 | # convert to pdf and save offline 280 | img2pdf(f'{save_at}/{book_name}', png_name_list, 281 | binary_threshold=binary_threshold, quality=quality) 282 | print('scanning finished.') 283 | if show_output: 284 | os_start_file(f'{save_at}/{book_name}.pdf') 285 | --------------------------------------------------------------------------------