├── WeReadScan
    ├── __init__.py
    ├── script
    │   ├── __init__.py
    │   ├── os_util.py
    │   └── png2pdf.py
    └── WeRead.py
├── requirements.txt
├── example
    ├── sample.png
    └── demo.py
├── .gitignore
└── README.md


/WeReadScan/__init__.py:
--------------------------------------------------------------------------------
1 | from .WeRead import WeRead


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | numpy
3 | matplotlib
4 | opencv-python
5 | selenium


--------------------------------------------------------------------------------
/example/sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Algebra-FUN/WeReadScan/HEAD/example/sample.png


--------------------------------------------------------------------------------
/WeReadScan/script/__init__.py:
--------------------------------------------------------------------------------
1 | from .png2pdf import img2pdf, png2bmp
2 | from .os_util import dir_check, os_start_file, clear_temp, escape
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | test*
 3 | setup*.py
 4 | wrs-temp/
 5 | *.pdf
 6 | dist/
 7 | build/
 8 | *egg-info/
 9 | temp*.py
10 | *.bat
11 | test
12 | .vscode


--------------------------------------------------------------------------------
/WeReadScan/script/os_util.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | os_util.py
 3 | Copyright 2020 by Algebra-FUN
 4 | ALL RIGHTS RESERVED.
 5 | '''
 6 | 
 7 | import os
 8 | import shutil
 9 | import re
10 | 
11 | 
def dir_check(dir):
    """
    Make sure the directory `dir` exists, creating missing parents as needed.

    :Args:
     - dir:
            directory path; a relative path is resolved against the current
            working directory, an absolute path is used as given.

    :Raises:
        FileExistsError if the path exists but is not a directory.
    """
    # os.path.join keeps an absolute `dir` intact instead of gluing it behind the cwd
    os.makedirs(os.path.join(os.getcwd(), dir), exist_ok=True)
17 | 
def os_start_file(file_name):
    """
    Open `file_name` with the operating system's default application.

    Opening is best-effort: if the file cannot be opened a message is printed
    and the function returns normally.

    :Args:
     - file_name:
            path of the file to open
    """
    import subprocess
    import sys

    # an absolute, normalised path avoids separator surprises and needs no shell quoting
    path = os.path.abspath(file_name)
    try:
        if sys.platform.startswith('win'):
            os.startfile(path)
        elif sys.platform == 'darwin':
            subprocess.run(['open', path], check=False)
        else:
            subprocess.run(['xdg-open', path], check=False)
    except OSError as error:
        print(f'cannot open {path}: {error}')
20 | 
def clear_temp(file_name):
    """
    Delete the directory tree rooted at `file_name`.

    A directory that does not exist is treated as already cleared.

    :Args:
     - file_name:
            path of the directory to remove
    """
    try:
        shutil.rmtree(file_name)
    except FileNotFoundError:
        # nothing was ever written there, so there is nothing to clean up
        pass
23 | 
def escape(file_name):
    """
    Return `file_name` with every character of the set / \\ : * ? " < > |
    replaced by an underscore.

    :Args:
     - file_name:
            the text to turn into a usable file name
    """
    forbidden = '/\\:*?"<>|'
    return ''.join('_' if char in forbidden else char for char in file_name)


--------------------------------------------------------------------------------
/WeReadScan/script/png2pdf.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | png2pdf.py
 3 | Copyright 2020 by Algebra-FUN
 4 | ALL RIGHTS RESERVED.
 5 | '''
 6 | 
 7 | 
 8 | import cv2
 9 | import numpy as np
10 | from PIL import Image
11 | 
12 | 
def png2bmp(file_path, binary_threshold=200):
    """
    Load `<file_path>.png` and return it as a black-and-white PIL image.

    :Args:
     - file_path:
            path of the png file WITHOUT the `.png` extension
     - binary_threshold=200:
            gray levels above this value become 255, all others become 0

    :Returns:
     - PIL.Image.Image built from the thresholded 8-bit array

    :Raises:
        ValueError if the file content cannot be decoded as an image.
    """
    # NOTE(review): np.fromfile + imdecode is presumably used instead of
    # cv2.imread so that non-ASCII paths load on Windows — confirm
    raw = np.fromfile(f'{file_path}.png', dtype=np.uint8)
    img = cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise ValueError(f'cannot decode image: {file_path}.png')

    # a png that is already single-channel has no colour axis to convert
    gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, binary_threshold, 255, cv2.THRESH_BINARY)
    return Image.fromarray(binary)
18 | 
19 | 
def img2pdf(file_name, jpg_name_list, binary_threshold=200, quality=100):
    """
    Binarize a sequence of png pages and save them as one multi-page pdf.

    :Args:
     - file_name:
            output path WITHOUT the `.pdf` extension
     - jpg_name_list:
            paths of the page images, in page order, each WITHOUT the `.png`
            extension (despite the name, the pages are read as png files)
     - binary_threshold=200:
            threshold forwarded to `png2bmp`
     - quality=100:
            passed to PIL as the `resolution` of the saved pdf

    :Raises:
        ValueError if `jpg_name_list` contains no pages.
    """
    pages = [png2bmp(path, binary_threshold=binary_threshold)
             for path in jpg_name_list]
    if not pages:
        raise ValueError('img2pdf needs at least one page image.')

    first_page, *other_pages = pages
    first_page.save(f'{file_name}.pdf', save_all=True,
                    append_images=other_pages, resolution=quality)
25 | 


--------------------------------------------------------------------------------
/example/demo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | demo.py
 3 | The demo of WeReadScan.py
 4 | Copyright 2020 by Algebra-FUN
 5 | ALL RIGHTS RESERVED.
 6 | """
 7 | 
 8 | 
 9 | from selenium.webdriver import Chrome, ChromeOptions
10 | 
11 | from WeReadScan import WeRead
12 | 
# options
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')  #! important argument
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('log-level=3')

# launch Webdriver
print('Webdriver launching...')
driver = Chrome(options=chrome_options)
print('Webdriver launched.')

try:
    with WeRead(driver, debug=True) as weread:
        weread.login()  #? login for grab the whole book
        weread.scan2pdf('https://weread.qq.com/web/reader/60b32c107207bc8960bd9cekecc32f3013eccbc87e4b62e')
finally:
    # WeRead only cleans its temp directory on exit; the browser process
    # belongs to this script and has to be shut down here
    driver.quit()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # WeReadScan
  2 | 
  3 | ![GitHub last commit](https://img.shields.io/github/last-commit/Algebra-FUN/WeReadScan) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/Algebra-FUN/WeReadScan) ![GitHub top language](https://img.shields.io/github/languages/top/Algebra-FUN/WeReadScan) [![pip](https://img.shields.io/badge/pip-0.8.7-orange)](https://pypi.org/project/WeReadScan/)
  4 | 
  5 | ## About
  6 | 
  7 | 一个用于将`微信读书`上的图书扫描并转换为本地PDF/HTML的爬虫库。
  8 | 
  9 | ### 谈谈为何而开发
 10 | 
 11 | 不得不说，“微信读书”是一个很好的平台。但是美中不足很明显，用户购买了图书资源，但是只能在“微信读书”的Application中阅读或者做一些文字批注╮(╯▽╰)╭，这些功能相较于购买的纸质书籍显然是不足的。比如，作者就习惯于用iPad的相关notebook类app做笔记，而“微信读书”并没有适配pencil做handwriting笔记的功能。
 12 | 
 13 | 因此，既然“微信读书”没有提供，那只好自己解决了。于是，经过2天的开发，终于有了这个爬虫脚本，也可以开心地做手写笔记了o(_￣▽￣_)ブ
 14 | 
 15 | ### 相关版本
 16 | 
 17 | 在[Sec-ant](https://github.com/Sec-ant)的建议下，参考了他的解决方案[weread-scraper](https://github.com/Sec-ant/weread-scraper)，将其中最重要的获取#preRenderContent的部分脚本进行整合，得到了[WeReadScan-HTML](https://github.com/Algebra-FUN/WeReadScan/tree/html-variant)版本，可以直接自动化获得多本图书的HTML，更加高效。
 18 | 
 19 | ## Get started
 20 | 
 21 | ```shell
 22 | pip install WeReadScan-HTML
 23 | ```
 24 | 
 25 | > 本项目需要使用selenium，需要对selenium具备基础的了解
 26 | 
 27 | ### Demo
 28 | 
 29 | 话不多说，直接上代码
 30 | 
 31 | ```python
 32 | from selenium.webdriver import Edge
 33 | from selenium.webdriver.edge.service import Service
 34 | from selenium.webdriver.edge.options import Options
 35 | 
 36 | from WeReadScan import WeRead
 37 | 
 38 | options = Options()
 39 | options.add_argument("--disable-blink-features=AutomationControlled")
 40 | options.add_argument('disable-infobars')
 41 | options.add_argument('log-level=3')
 42 | options.add_argument("headless")
 43 | 
 44 | # launch Webdriver
 45 | print('Webdriver launching...')
 46 | driver = Edge(options=options)
 47 | # driver = Edge(service=service, options=options)
 48 | print('Webdriver launched.')
 49 | 
 50 | with WeRead(driver,debug=True) as weread:
 51 |     weread.login() #? login for grab the whole book
 52 |     weread.scan2html('https://weread.qq.com/web/reader/2c632ef071a486a92c60226kc81322c012c81e728d9d180')
 53 |     weread.scan2html('https://weread.qq.com/web/reader/a9c32f40717db77aa9c9171kc81322c012c81e728d9d180')
 54 | ```
 55 | 
 56 | 扫描结果样例：
 57 | 
 58 | ![](https://github.com/Algebra-FUN/WeReadScan/blob/master/example/sample.png?raw=true)
 59 | 
 60 | 几点说明：
 61 | 
 62 | 1.  webdriver 需要 `无头(headless)` 模式启动
 63 | 2.  只有登录后，才能扫描完整的图书资源；若不登录，也可以扫描无需解锁的部分
 64 | 
 65 | ## API Reference
 66 | 
 67 | ### WeRead
 68 | 
 69 | WeReadScan.WeRead(headless_driver)
 70 | 
 71 | `微信读书`网页代理，用于图书扫描
 72 | 
 73 | #### Args
 74 | 
 75 | -   headless_driver:	设置了headless的Webdriver实例
 76 | 
 77 | #### Returns
 78 | 
 79 | -   WeReadInstance
 80 | 
 81 | #### Usage
 82 | 
 83 | ```python
 84 | chrome_options = ChromeOptions()
 85 | chrome_options.add_argument('--headless')
 86 | headless_driver = Chrome(options=chrome_options)
 87 | weread = WeRead(headless_driver)
 88 | ```
 89 | 
 90 | ### Login
 91 | 
 92 | WeReadScan.WeRead.login(wait_turns=15)
 93 | 
 94 | 展示二维码以登录微信读书
 95 | 
 96 | #### Args
 97 | 
 98 | -   wait_turns:	等待扫描登录二维码的轮数
 99 | 
100 | #### Usage
101 | 
102 | ```python
103 | weread.login()
104 | ```
105 | 
106 | ### Scan2html
107 | 
108 | WeReadScan.WeRead.scan2html(book_url, save_at='.', show_output=True)
109 | 
110 | 扫描`微信读书`的书籍，转换为HTML并保存到本地
111 | 
112 | #### Args
113 | 
114 | | 参数名              | 类型   | 默认值  | 描述                    |
115 | | ---------------- | ---- | ---- | --------------------- |
116 | | book_url         | str  | 必填   | 扫描目标书籍的URL            |
117 | | save_at          | str  | '.'  | 保存地址                  |
118 | | show_output      | bool | True | 是否在该方法函数结束时展示生成的HTML文件 |
119 | 
120 | #### Usage
121 | 
122 | ```python
123 | weread.scan2html('https://weread.qq.com/web/reader/a57325c05c8ed3a57224187kc81322c012c81e728d9d180')
124 | ```
125 | 
126 | ## Disclaimer
127 | 
128 | -   本脚本仅限用于**已购**图书的爬取，用于私人学习目的，禁止用于商业目的和网上资源扩散，尊重微信读书方面的利益
129 | -   若User使用该脚本用于不当的目的，责任由使用者承担，作者概不负责
130 | 
131 | ## Stargazers over time
132 | 
133 | [![Stargazers over time](https://starchart.cc/Algebra-FUN/WeReadScan.svg)](https://starchart.cc/Algebra-FUN/WeReadScan)
134 |       
135 | 


--------------------------------------------------------------------------------
/WeReadScan/WeRead.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | WeRead.py
  3 | Copyright 2020 by Algebra-FUN
  4 | ALL RIGHTS RESERVED.
  5 | '''
  6 | 
  7 | from matplotlib import pyplot as plt
  8 | from PIL import Image
  9 | from selenium.webdriver.common.by import By
 10 | from selenium.webdriver.support.wait import WebDriverWait
 11 | from selenium.webdriver.remote.webdriver import WebDriver
 12 | 
 13 | from .script import img2pdf, dir_check, os_start_file, clear_temp, escape
 14 | 
 15 | from time import sleep
 16 | 
 17 | 
 18 | class WeRead:
 19 |     """
 20 |         The agency who control `WeRead` web page with selenium webdriver to processing book scanning.
 21 | 
 22 |         `微信读书`网页代理，用于图书扫描
 23 | 
 24 |         :Args:
 25 |          - headless_driver:
 26 |                 Webdriver instance with headless option set.
 27 |                 设置了headless的Webdriver示例
 28 | 
 29 |         :Returns:
 30 |          - WeReadInstance
 31 | 
 32 |         :Usage:
 33 |             chrome_options = ChromeOptions()
 34 |             chrome_options.add_argument('--headless')
 35 | 
 36 |             headless_driver = Chrome(chrome_options=chrome_options)
 37 | 
 38 |             weread = WeRead(headless_driver)
 39 |     """
 40 | 
 41 |     def __init__(self, headless_driver: WebDriver, patience=30, debug=False):
 42 |         headless_driver.get('https://weread.qq.com/')
 43 |         headless_driver.implicitly_wait(5)
 44 |         self.driver: WebDriver = headless_driver
 45 |         self.patience = patience
 46 |         self.debug_mode = debug
 47 | 
 48 |     def __enter__(self):
 49 |         return self
 50 | 
 51 |     def __exit__(self, *args):
 52 |         if not self.debug_mode:
 53 |             clear_temp('wrs-temp')
 54 | 
 55 |     def S(self, selector):
 56 |         return WebDriverWait(self.driver, self.patience).until(lambda driver: driver.find_element(By.CSS_SELECTOR, selector))
 57 | 
 58 |     def click(self, target):
 59 |         self.driver.execute_script('arguments[0].click();', target)
 60 | 
 61 |     def shot_full_canvas_context(self, file_name):
 62 |         renderTargetContainer = self.S('.renderTargetContainer')
 63 |         height = renderTargetContainer.get_property('offsetHeight')
 64 |         height += renderTargetContainer.get_property('offsetTop')
 65 |         width = self.driver.execute_script("return window.outerWidth")
 66 |         self.driver.set_window_size(width, height)
 67 |         sleep(1)
 68 |         content = self.S('.app_content')
 69 |         content.screenshot(file_name)
 70 | 
 71 |     def check_all_image_loaded(self, frequency=10, max_wait_duration=30):
 72 |         """
 73 |         check if all image is loaded.
 74 | 
 75 |         检查图书Image是否全部加载完毕.
 76 |         """
 77 |         interval = 1/frequency
 78 | 
 79 |         try:
 80 |             img_unloaded = WebDriverWait(self.driver, 3).until(
 81 |                 lambda driver: driver.find_elements(By.CSS_SELECTOR, 'img.wr_absolute'))
 82 |         except Exception:
 83 |             return False
 84 | 
 85 |         for _ in range(frequency*max_wait_duration):
 86 |             sleep(interval)
 87 |             for img in img_unloaded:
 88 |                 if img.get_property('complete'):
 89 |                     img_unloaded.remove(img)
 90 |             if not len(img_unloaded):
 91 |                 self.debug_mode and print('all image is loaded!')
 92 |                 return True
 93 |         return False
 94 | 
 95 |     def login(self, wait_turns=15):
 96 |         """
 97 |         show QRCode to login weread.
 98 | 
 99 |         展示二维码以登陆微信读书
100 | 
101 |         :Args:
102 |          - wait_turns: 
103 |                 Loop turns wait for scanning QRCode
104 |                 登陆二维码等待扫描的等待轮数
105 | 
106 |         :Usage:
107 |             weread.login()
108 |         """
109 | 
110 |         dir_check('wrs-temp')
111 | 
112 |         # get QRCode for Login
113 |         self.S('button.navBar_link_Login').click()
114 |         self.S('.login_dialog_qrcode>img').screenshot(
115 |             'wrs-temp/login_qrcode.png')
116 | 
117 |         login_qrcode = Image.open('wrs-temp/login_qrcode.png')
118 |         plt.ion()
119 |         plt.title('Scan this QRCode to Login.')
120 |         plt.imshow(login_qrcode)
121 |         plt.show()
122 |         plt.pause(.001)
123 | 
124 |         # wair for QRCode Scan
125 |         for i in range(wait_turns):
126 |             print(f'Wait for QRCode Scan...{i}/{wait_turns}turns')
127 |             try:
128 |                 self.driver.find_element(By.CSS_SELECTOR, '.menu_container')
129 |                 print('Login Succeed.')
130 |                 break
131 |             except Exception:
132 |                 plt.pause(1)
133 |         else:
134 |             raise Exception('WeRead.Timeout: Login timeout.')
135 | 
136 |         # close QRCode Window
137 |         plt.ioff()
138 |         plt.close()
139 | 
140 |     def switch_to_context(self):
141 |         """switch to main body of the book"""
142 |         self.S('button.catalog').click()
143 |         self.S('li.chapterItem:nth-child(2)').click()
144 | 
145 |     def set_font_size(self, font_size_index=1):
146 |         """
147 |         set font size
148 | 
149 |         设置字体大小
150 | 
151 |         :Args:
152 |          - font_size_index=0:
153 |                 the index of font size(1-7)
154 |                 字体大小级别(1-7)
155 |                 In particular, 1 represents minimize, 7 represents maximize
156 |                 特别地，1为最小，7为最大
157 |         """
158 |         sleep(1)
159 |         self.S('button.fontSizeButton').click()
160 |         sleep(1)
161 |         self.S(f'.vue-slider-mark:nth-child({font_size_index})').click()
162 |         self.S('.app_content').click()
163 | 
164 |     def turn_light_on(self):
165 |         sleep(1)
166 |         self.S('button.readerControls_item.white').click()
167 | 
168 |     def scan2pdf(self, book_url, save_at='.', binary_threshold=200, quality=100, show_output=True, font_size_index=1):
169 |         """
170 |         scan `weread` book to pdf and save offline.
171 | 
172 |         扫面`微信读书`的书籍转换为PDF并保存本地
173 | 
174 |         :Args:
175 |          - book_url:
176 |                 the url of weread book which aimed to scan
177 |                 扫描目标书籍的ULR
178 |          - save_at='.':
179 |                 the path of where to save
180 |                 保存地址
181 |          - binary_threshold=200:
182 |                 threshold of scan binary
183 |                 二值化处理的阈值
184 |          - quality=95:
185 |                 quality of scan pdf
186 |                 扫描PDF的质量
187 |          - show_output=True:
188 |                 if show the output pdf file at the end of this method
189 |                 是否在该方法函数结束时展示生成的PDF文件
190 |          - font_size_index=1:
191 |                 the index of font size(1-7)
192 |                 字体大小级别(1-7)
193 |                 In particular, 1 represents minimize, 7 represents maximize
194 |                 特别地，1为最小，7为最大
195 | 
196 |         :Usage:
197 |             weread.scan2pdf('https://weread.qq.com/web/reader/a57325c05c8ed3a57224187kc81322c012c81e728d9d180')
198 |         """
199 |         print('Task launching...')
200 | 
201 |         # valid the url
202 |         if 'https://weread.qq.com/web/reader/' not in book_url:
203 |             raise Exception('WeRead.UrlError: Wrong url format.')
204 | 
205 |         # switch to target book url
206 |         self.driver.get(book_url)
207 |         print(f'navigate to {book_url}')
208 | 
209 |         # turn theme to light theme
210 |         self.turn_light_on()
211 | 
212 |         # set font size
213 |         self.set_font_size(font_size_index)
214 | 
215 |         # switch to target book's cover
216 |         self.switch_to_context()
217 | 
218 |         # get the name of the book
219 |         book_name = escape(self.S('span.readerTopBar_title_link').text)
220 |         print(f'preparing to scan "{book_name}"')
221 | 
222 |         # check the dir for future save
223 |         dir_check(f'wrs-temp/{book_name}/context')
224 | 
225 |         # used to store png_name for pdf converting
226 |         png_name_list = []
227 | 
228 |         page = 1
229 | 
230 |         while True:
231 |             sleep(1)
232 | 
233 |             # get chapter
234 |             chapter = escape(self.S('span.readerTopBar_title_chapter').text)
235 |             print(f'scanning chapter "{chapter}"')
236 | 
237 |             # locate the renderTargetContent
238 |             context = self.S('div.app_content')
239 | 
240 |             # check all image loaded
241 |             self.check_all_image_loaded()
242 | 
243 |             # context_scan2png
244 |             png_name = f'wrs-temp/{book_name}/context/{chapter}_{page}'
245 |             self.shot_full_canvas_context(f'{png_name}.png')
246 | 
247 |             png_name_list.append(png_name)
248 |             print(f'save chapter scan {png_name}')
249 | 
250 |             # find next page or chapter button
251 |             try:
252 |                 readerFooter = self.S(
253 |                     '.readerFooter_button,.readerFooter_ending')
254 |             except Exception:
255 |                 break
256 | 
257 |             readerFooterClass = readerFooter.get_attribute('class')
258 | 
259 |             # quick ending
260 |             if 'ending' in readerFooterClass:
261 |                 break
262 | 
263 |             next_btn_text = readerFooter.text.strip()
264 | 
265 |             if next_btn_text == "下一页":
266 |                 print("go to next page")
267 |                 page += 1
268 |             elif next_btn_text == "下一章":
269 |                 print("go to next chapter")
270 |                 page = 1
271 |             else:
272 |                 raise Exception("Unexpected Exception")
273 | 
274 |             # go to next page or chapter
275 |             readerFooter.click()
276 | 
277 |         print('pdf converting...')
278 | 
279 |         # convert to pdf and save offline
280 |         img2pdf(f'{save_at}/{book_name}', png_name_list,
281 |                 binary_threshold=binary_threshold, quality=quality)
282 |         print('scanning finished.')
283 |         if show_output:
284 |             os_start_file(f'{save_at}/{book_name}.pdf')
285 | 


--------------------------------------------------------------------------------