├── LICENSE ├── README.md ├── crawl_wqxt.py ├── imgautocompress.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | “文泉学堂”PDF下载 2 | ==================== 3 | 4 | [文泉学堂](https://lib-nuanxin.wqxuetang.com/) 5 | 6 | 1. 安装 requirements.txt 里的依赖 7 | 2. 找到你要的书,看地址栏的数字为 id 8 | 3. 运行 `python3 crawl_wqxt.py ` 9 | 10 | 服务器生成图片需要时间,可能出现 not loaded,会稍候重试。若一直出现 not loaded(第二遍还是),请尝试重新运行,已下载的图片不会重新下载。 11 | 12 | 若需要清理缓存,请删除 wqxt.db 或自行更改其内容(SQLite 数据库)。 13 | 14 | 若需要登录,请自行在 `crawl_wqxt.py` 的 HEADERS(36行)里加 Cookie 等内容。 15 | 16 | 请合理使用服务器资源。版权问题概不负责。 17 | 18 | imgautocompress.py 会对下载的图片判断是否为灰度、是否为黑白,并转成相应格式,减少图片大小。 19 | 20 | 要类似地减少其他扫描版 PDF 文件大小,可以使用 [pdfreduce](https://github.com/gumblex/pdfreduce)。要添加 OCR 层,可使用 [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF)。 21 | -------------------------------------------------------------------------------- /crawl_wqxt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import sys 7 | import time 8 | import json 9 | import sqlite3 10 | import logging 11 | import collections 12 | 13 | import jwt 14 | import img2pdf 15 | import imgautocompress 16 | 17 | try: 18 | from httpx import Client as Session 19 | except ImportError: 20 | from requests import Session 21 | 22 | WITH_PDFRW = True 23 | 24 | if WITH_PDFRW: 25 | try: 26 | from pdfrw import PdfDict, PdfName 27 | except ImportError: 28 | PdfDict = img2pdf.MyPdfDict 29 | PdfName = img2pdf.MyPdfName 30 | WITH_PDFRW = False 31 | else: 32 | PdfDict = img2pdf.MyPdfDict 33 | PdfName = img2pdf.MyPdfName 34 | 35 | 36 | HEADERS = { 37 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 38 | "Accept-Encoding": "gzip, deflate", 39 | "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6", 40 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0", 41 | } 42 | 43 | re_author = re.compile(r'《.+?》\s*(.+?)\s*【') 44 | 45 | logging.basicConfig(stream=sys.stderr, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) 46 | 47 | 48 | class APIError(ValueError): 49 | pass 50 | 51 | 52 | class TryAgain(ValueError): 53 | pass 54 | 55 | 56 | def generate_pdf_outline(pdf, contents, parent=None): 57 | if parent is None: 58 | parent = PdfDict(indirect=True) 59 | if not contents: 60 | return parent 61 | first = prev = None 62 | for k, row in enumerate(contents): 63 | try: 64 | page = pdf.writer.pagearray[int(row['pnum'])-1] 65 | except IndexError: 66 | # bad bookmark 67 | continue 68 | bookmark = PdfDict( 69 | Parent=parent, 70 | Title=row['label'], 71 | A=PdfDict( 72 | D=[page, PdfName.Fit], 73 | S=PdfName.GoTo 74 | ), 75 | indirect=True 76 | ) 77 | children = row.get('children') 78 | if children: 79 | bookmark = generate_pdf_outline(pdf, children, bookmark) 80 | if first: 81 | bookmark[PdfName.Prev] = prev 82 | prev[PdfName.Next] = bookmark 83 | else: 84 | first = bookmark 85 | prev = bookmark 86 | parent[PdfName.Count] = k + 1 87 | parent[PdfName.First] = first 88 | parent[PdfName.Last] = prev 89 | return parent 90 | 91 | 92 | def pdf_convert(*images, **kwargs): 93 | 94 | _default_kwargs = dict( 95 | title=None, 96 | author=None, 97 | creator=None, 98 | producer=None, 99 | creationdate=None, 100 | moddate=None, 101 | subject=None, 102 | keywords=None, 103 | colorspace=None, 104 | contents=None, 105 | nodate=False, 106 | layout_fun=img2pdf.default_layout_fun, 107 | viewer_panes=None, 108 | viewer_initial_page=None, 109 | viewer_magnification=None, 110 | viewer_page_layout=None, 111 | viewer_fit_window=False, 112 | viewer_center_window=False, 113 | viewer_fullscreen=False, 114 | with_pdfrw=True, 115 | first_frame_only=False, 116 | allow_oversized=True, 117 | ) 118 | for kwname, default in _default_kwargs.items(): 119 | if kwname not in kwargs: 120 | kwargs[kwname] = default 121 | 122 | pdf = img2pdf.pdfdoc( 123 | "1.3", 124 | kwargs["title"], 125 | kwargs["author"], 126 | kwargs["creator"], 127 | kwargs["producer"], 128 | kwargs["creationdate"], 129 | kwargs["moddate"], 130 | kwargs["subject"], 131 | kwargs["keywords"], 132 | kwargs["nodate"], 133 | kwargs["viewer_panes"], 134 | kwargs["viewer_initial_page"], 135 | kwargs["viewer_magnification"], 136 | kwargs["viewer_page_layout"], 137 | kwargs["viewer_fit_window"], 138 | kwargs["viewer_center_window"], 139 | kwargs["viewer_fullscreen"], 140 | kwargs["with_pdfrw"], 141 | ) 142 | 143 | # backwards compatibility with older img2pdf versions where the first 144 | # argument to the function had to be given as a list 145 | if len(images) == 1: 146 | # if only one argument was given and it is a list, expand it 147 | if isinstance(images[0], (list, tuple)): 148 | images = images[0] 149 | 150 | if not isinstance(images, (list, tuple)): 151 | images = [images] 152 | 153 | for img in images: 154 | # img is allowed to be a path, a binary string representing image data 155 | # or a file-like object (really anything that implements read()) 156 | try: 157 | rawdata = img.read() 158 | except AttributeError: 159 | if not isinstance(img, (str, bytes)): 160 | raise TypeError("Neither implements read() nor is str or bytes") 161 | # the thing doesn't have a read() function, so try if we can treat 162 | # it as a file name 163 | try: 164 | with open(img, "rb") as f: 165 | rawdata = f.read() 166 | except Exception: 167 | # whatever the exception is (string could contain NUL 168 | # characters or the path could just not exist) it's not a file 169 | # name so we now try treating it as raw image content 170 | rawdata = img 171 | 172 | for ( 173 | color, 174 | ndpi, 175 | imgformat, 176 | imgdata, 177 | imgwidthpx, 178 | imgheightpx, 179 | palette, 180 | inverted, 181 | depth, 182 | rotation, 183 | ) in img2pdf.read_images(rawdata, kwargs["colorspace"], kwargs["first_frame_only"]): 184 | pagewidth, pageheight, imgwidthpdf, imgheightpdf = kwargs["layout_fun"]( 185 | imgwidthpx, imgheightpx, ndpi 186 | ) 187 | 188 | userunit = None 189 | if pagewidth < 3.00 or pageheight < 3.00: 190 | logging.warning( 191 | "pdf width or height is below 3.00 - too " "small for some viewers!" 192 | ) 193 | elif pagewidth > 14400.0 or pageheight > 14400.0: 194 | if kwargs["allow_oversized"]: 195 | userunit = img2pdf.find_scale(pagewidth, pageheight) 196 | pagewidth /= userunit 197 | pageheight /= userunit 198 | imgwidthpdf /= userunit 199 | imgheightpdf /= userunit 200 | else: 201 | raise img2pdf.PdfTooLargeError( 202 | "pdf width or height must not exceed 200 inches." 203 | ) 204 | # the image is always centered on the page 205 | imgxpdf = (pagewidth - imgwidthpdf) / 2.0 206 | imgypdf = (pageheight - imgheightpdf) / 2.0 207 | pdf.add_imagepage( 208 | color, 209 | imgwidthpx, 210 | imgheightpx, 211 | imgformat, 212 | imgdata, 213 | imgwidthpdf, 214 | imgheightpdf, 215 | imgxpdf, 216 | imgypdf, 217 | pagewidth, 218 | pageheight, 219 | userunit, 220 | palette, 221 | inverted, 222 | depth, 223 | rotation, 224 | ) 225 | 226 | if kwargs['contents']: 227 | if pdf.with_pdfrw: 228 | catalog = pdf.writer.trailer.Root 229 | else: 230 | catalog = pdf.writer.catalog 231 | catalog[PdfName.Outlines] = generate_pdf_outline(pdf, kwargs['contents']) 232 | 233 | if kwargs["outputstream"]: 234 | pdf.tostream(kwargs["outputstream"]) 235 | return 236 | 237 | return pdf.tostring() 238 | 239 | 240 | class WQXTDownloader: 241 | baseurl = 'https://lib-nuanxin.wqxuetang.com/read/pdf/' 242 | jwt_secret = "g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w" 243 | loading_img = '3f08d2c4b0d8cac7641730c7f27f7263c8687bc67cdf179de6996edb9d8409bf09664e035b56d72c00d0b46d8dca1868a48290f469064efd5ba611958fe614e1' 244 | 245 | def __init__(self, downloadpath='.', db='wqxt.db'): 246 | self.downloadpath = downloadpath 247 | self.db = sqlite3.connect(db) 248 | self.session = Session() 249 | self.session.headers.update(HEADERS) 250 | self.init_db() 251 | 252 | def init_db(self): 253 | cur = self.db.cursor() 254 | cur.execute('PRAGMA case_sensitive_like=1') 255 | cur.execute('CREATE TABLE IF NOT EXISTS api_cache (' 256 | 'url TEXT PRIMARY KEY,' 257 | 'updated INTEGER,' 258 | 'value TEXT' 259 | ')') 260 | cur.execute('CREATE TABLE IF NOT EXISTS book_img (' 261 | 'bookid INTEGER,' 262 | 'page INTEGER,' 263 | 'updated INTEGER,' 264 | 'data BLOB,' 265 | 'PRIMARY KEY (bookid, page)' 266 | ')') 267 | self.db.commit() 268 | 269 | def json_call(self, bookid, url, cache=True): 270 | cur = self.db.cursor() 271 | url = url % bookid 272 | if cache: 273 | cur.execute('SELECT value FROM api_cache WHERE url=?', (url,)) 274 | res = cur.fetchone() 275 | if res: 276 | return json.loads(res[0]) 277 | r = self.session.get(url, headers={ 278 | 'referer': self.baseurl + str(bookid), 279 | 'sec-fetch-mode': 'cors', 280 | 'sec-fetch-site': 'same-origin', 281 | 'user': 'bapkg/com.bookask.wqxuetang baver/1.1.1', 282 | }) 283 | r.raise_for_status() 284 | result = r.json() 285 | if result['errcode']: 286 | name = url.rsplit('/', 1)[-1] 287 | raise APIError('%s [%s]: %s', name, result['errcode'], result['errmsg']) 288 | cur.execute('REPLACE INTO api_cache VALUES (?,?,?)', ( 289 | url, int(time.time()), json.dumps(result['data']))) 290 | self.db.commit() 291 | return result['data'] 292 | 293 | def get_img(self, bookid, page, jwtkey): 294 | cur = self.db.cursor() 295 | cur.execute('SELECT data FROM book_img WHERE bookid=? AND page=?', 296 | (bookid, page)) 297 | res = cur.fetchone() 298 | if res: 299 | return res[0] 300 | cur_time = time.time() 301 | jwttoken = jwt.encode({ 302 | "p": page, 303 | "t": int(cur_time*1000), 304 | "b": str(bookid), 305 | "w": 1000, 306 | "k": json.dumps(jwtkey), 307 | "iat": int(cur_time) 308 | }, self.jwt_secret, algorithm='HS256').decode('ascii') 309 | r = self.session.get( 310 | "https://lib-nuanxin.wqxuetang.com/page/img/%s/%s?k=%s" % ( 311 | bookid, page, jwttoken), headers={ 312 | 'referer': self.baseurl + str(bookid), 313 | 'sec-fetch-mode': 'no-cors', 314 | 'sec-fetch-site': 'same-origin', 315 | }) 316 | r.raise_for_status() 317 | result = r.content 318 | if r.headers.get('pragma') != 'catch': 319 | raise TryAgain() 320 | cur.execute('REPLACE INTO book_img VALUES (?,?,?,?)', ( 321 | bookid, page, int(cur_time), result)) 322 | self.db.commit() 323 | return result 324 | 325 | def download_pdf(self, bookid, convertimg=True): 326 | logging.info('%s: Loading metadata', bookid) 327 | r = self.session.get(self.baseurl + str(bookid)) 328 | r.raise_for_status() 329 | metadata = self.json_call(bookid, "https://lib-nuanxin.wqxuetang.com/v1/read/initread?bid=%s") 330 | title = metadata['name'] 331 | try: 332 | author = re_author.match(metadata['title']).group(1) 333 | except Exception: 334 | author = None 335 | contents = self.json_call(bookid, "https://lib-nuanxin.wqxuetang.com/v1/book/catatree?bid=%s") 336 | sizes = self.json_call(bookid, "https://lib-nuanxin.wqxuetang.com/page/size/?bid=%s") 337 | jwtkey = self.json_call(bookid, "https://lib-nuanxin.wqxuetang.com/v1/read/k?bid=%s", cache=False) 338 | page_num = int(metadata['pages']) 339 | images = [None] * page_num 340 | tasks = collections.deque(range(1, page_num+1)) 341 | while tasks: 342 | i = tasks.popleft() 343 | try: 344 | img = self.get_img(bookid, i, jwtkey) 345 | if convertimg: 346 | img, imgfmt = imgautocompress.auto_encode(img) 347 | images[i-1] = img 348 | logging.info('%s: %s/%s', bookid, i, page_num) 349 | except TryAgain: 350 | tasks.append(i) 351 | logging.info('%s: %s/%s not loaded', bookid, i, page_num) 352 | time.sleep(0.5) 353 | except Exception: 354 | tasks.append(i) 355 | logging.exception('%s: %s/%s', bookid, i, page_num) 356 | time.sleep(1) 357 | logging.info('%s: Generating PDF', bookid) 358 | filename = "%s-%s.pdf" % ( 359 | bookid, title.replace('/', '_').replace(':', ':')) 360 | with open(filename, "wb") as f: 361 | pdf_convert( 362 | images, 363 | title=metadata['name'], 364 | author=author, 365 | with_pdfrw=True, 366 | contents=contents, 367 | outputstream=f 368 | ) 369 | 370 | if __name__ == '__main__': 371 | # usage: python3 crawl_wqxt.py 372 | dl = WQXTDownloader() 373 | dl.download_pdf(int(sys.argv[1])) 374 | -------------------------------------------------------------------------------- /imgautocompress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import io 6 | import sys 7 | import math 8 | import numpy as np 9 | from PIL import Image, ImageStat, ImageFilter 10 | 11 | _PIXWEIGHT = np.concatenate((np.arange(128, 0, -1), np.arange(0, 128))) / 128 12 | 13 | 14 | def otsu_threshold(hist): 15 | total = sum(hist) 16 | sumB = 0 17 | wB = 0 18 | maximum = 0.0 19 | sum1 = np.dot(np.arange(256), hist) 20 | for i in range(256): 21 | wB += hist[i] 22 | wF = total - wB 23 | if wB == 0 or wF == 0: 24 | continue 25 | sumB += i * hist[i] 26 | mF = (sum1 - sumB) / wF 27 | between = wB * wF * ((sumB / wB) - mF) * ((sumB / wB) - mF) 28 | if between >= maximum: 29 | level = i + 1 30 | maximum = between 31 | return level 32 | 33 | 34 | def auto_downgrade(pil_img, thumb_size=128, grey_cutoff=1, bw_ratio=0.99, bw_supersample=1): 35 | mode = pil_img.mode 36 | if mode == '1' and mode not in ('L', 'LA', 'RGB', 'RGBA'): 37 | # ignore special modes 38 | return pil_img 39 | elif mode == 'P': 40 | pil_img = pil_img.convert('RGB') 41 | elif mode == 'PA': 42 | pil_img = pil_img.convert('RGBA') 43 | bands = pil_img.getbands() 44 | alpha_band = False 45 | if bands[-1] == 'A': 46 | alpha_band = True 47 | if all(x == 255 for x in pil_img.getdata(len(bands) - 1)): 48 | alpha_band = False 49 | if bands[:3] == ('R', 'G', 'B'): 50 | thumb = pil_img.resize((thumb_size,thumb_size), resample=Image.BILINEAR) 51 | pixels = np.array(thumb.getdata(), dtype=float)[:, :3] 52 | pixels_max = np.max(pixels, axis=1) 53 | pixels_min = np.min(pixels, axis=1) 54 | val = np.mean(pixels_max - pixels_min) 55 | if val > grey_cutoff: 56 | if bands[-1] == 'A' and not alpha_band: 57 | return pil_img.convert('RGB') 58 | else: 59 | return pil_img 60 | if alpha_band: 61 | return pil_img.convert('LA') 62 | else: 63 | pil_img = pil_img.convert('L') 64 | if alpha_band: 65 | return pil_img 66 | hist = pil_img.histogram()[:256] 67 | if np.average(_PIXWEIGHT, weights=hist) > bw_ratio: 68 | if bw_supersample != 1: 69 | width, height = pil_img.size 70 | width = round(width * bw_supersample) 71 | height = round(height * bw_supersample) 72 | scaled = pil_img.resize((width, height), resample=Image.BICUBIC) 73 | else: 74 | scaled = pil_img 75 | threshold = otsu_threshold(hist) 76 | if 50 < threshold < 250: # resonable range 77 | scaled = scaled.point(lambda p: p > threshold and 255) 78 | return scaled.convert('1', dither=Image.NONE) 79 | if bands[-1] == 'A': 80 | return pil_img.convert('L') 81 | return pil_img 82 | 83 | 84 | def auto_encode(fp, quality=95, thumb_size=128, grey_cutoff=1, bw_ratio=0.99, bw_supersample=1): 85 | if isinstance(fp, str): 86 | with open(fp, 'rb') as f: 87 | orig_data = f.read() 88 | elif isinstance(fp, bytes): 89 | orig_data = fp 90 | else: 91 | orig_data = fp.read() 92 | orig_buf = io.BytesIO(orig_data) 93 | orig_size = len(orig_data) 94 | im = Image.open(orig_buf) 95 | out_im = auto_downgrade(im, thumb_size, grey_cutoff, bw_ratio) 96 | buf = io.BytesIO() 97 | if out_im.mode == '1': 98 | out_im.save(buf, 'TIFF', compression='group4') 99 | return buf.getvalue(), 'TIFF' 100 | elif out_im.mode[0] == 'L' or out_im.mode[-1] == 'A': 101 | out_im.save(buf, 'PNG', optimize=True) 102 | return buf.getvalue(), 'PNG' 103 | if im.format.startswith('JPEG'): 104 | out_format = 'PNG' 105 | out_im.save(buf, 'PNG', optimize=True) 106 | else: 107 | out_format = 'JPEG' 108 | out_im.convert('RGB').save(buf, 'JPEG', quality=quality, optimize=True) 109 | out_data = buf.getvalue() 110 | if len(out_data) > orig_size: 111 | if out_im.mode == im.mode: 112 | return orig_data, im.format 113 | else: 114 | buf = io.BytesIO() 115 | out_im.save(buf, 'PNG', optimize=True) 116 | return buf.getvalue(), 'PNG' 117 | else: 118 | return out_data, out_format 119 | 120 | 121 | if __name__ == '__main__': 122 | input_file = sys.argv[1] 123 | output_prefix = sys.argv[2] 124 | output_data, output_format = auto_encode(input_file) 125 | if output_format == 'JPEG': 126 | output_name = output_prefix + '.jpg' 127 | else: 128 | output_name = output_prefix + '.' + output_format.lower() 129 | with open(output_name, 'wb') as f: 130 | f.write(output_data) 131 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | httpx 2 | Pillow 3 | numpy 4 | img2pdf 5 | pyjwt 6 | pdfrw 7 | --------------------------------------------------------------------------------