├── README.md ├── .gitignore ├── LICENSE └── src └── __init__.py /README.md: -------------------------------------------------------------------------------- 1 | ## calibre-douban 2 | Calibre douban metadata download plugin. 3 | Based on https://book.douban.com web pages. 4 | 5 | ### Calibre插件 6 | 7 | 最近在使用calibre-web管理电子书,不过很多时候还是需要用到Calibre桌面版软件,批量管理,编辑电子书等功能,在calibre-web上已经使用calibre-web-douban-api搜索豆瓣元数据,但是桌面版Calibre软件却没有办法使用,不过calibre可以使用插件,而且是使用python开发,因此可以把calibre-web-douban-api改造一下包装成calibre插件,简单元数据插件还是比较容易的 8 | 9 | ### 安装方法 10 | 11 | 下载地址:[NewDouban.zip](https://github.com/fugary/calibre-douban/releases/latest/download/NewDouban.zip) 12 | 13 | 从release页面下载zip包,然后在calibre中安装为插件即可。 14 | 15 | 参考文档:https://fugary.com/?p=423 16 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import random 4 | import gzip 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | from datetime import datetime 7 | from queue import Queue, Empty 8 | from urllib.parse import urlparse, unquote, urlencode 9 | from urllib.request import Request, urlopen 10 | 11 | from calibre import random_user_agent 12 | from calibre.ebooks.metadata import check_isbn 13 | from calibre.ebooks.metadata.book.base import Metadata 14 | from calibre.ebooks.metadata.sources.base import Source, Option 15 | from calibre.ebooks.BeautifulSoup import BeautifulSoup 16 | from bs4 import Tag 17 | 18 | DOUBAN_BOOK_BASE = "https://book.douban.com/" 19 | DOUBAN_SEARCH_JSON_URL = "https://www.douban.com/j/search" 20 | DOUBAN_SEARCH_URL = "https://www.douban.com/search" 21 | DOUBAN_BOOK_URL = 
DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'
DOUBAN_BOOK_CAT = "1001"
DOUBAN_CONCURRENCY_SIZE = 5  # number of concurrent queries
DOUBAN_BOOK_URL_PATTERN = re.compile(".*/subject/(\\d+)/?")
PROVIDER_NAME = "New Douban Books"
PROVIDER_ID = "new_douban"
PROVIDER_VERSION = (2, 3, 0)
PROVIDER_AUTHOR = 'Gary Fu'
# (pattern, strptime format) pairs accepted for the scraped publication date;
# compiled once here instead of on every to_metadata() call.
_PUBDATE_FORMATS = (
    (re.compile('^\\d{4}-\\d+$'), '%Y-%m'),
    (re.compile('^\\d{4}-\\d+-\\d+$'), '%Y-%m-%d'),
)


class DoubanBookSearcher:
    """Download Douban search and book-detail pages and parse them into book dicts."""

    def __init__(self, max_workers, douban_delay_enable, douban_login_cookie):
        """Create the searcher.

        :param max_workers: size of the thread pool; also the maximum number of
            search hits whose detail pages are loaded.
        :param douban_delay_enable: when truthy, sleep a short random time
            before every detail-page request.
        :param douban_login_cookie: optional value sent as the ``Cookie`` header.
        """
        self.book_parser = DoubanBookHtmlParser()
        self.max_workers = max_workers
        self.thread_pool = ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix='douban_async')
        self.douban_delay_enable = douban_delay_enable
        self.douban_login_cookie = douban_login_cookie

    def calc_url(self, href):
        """Return the book URL carried in the ``url`` query parameter of *href*.

        Returns ``None`` when the parameter is missing or the decoded value
        does not match ``DOUBAN_BOOK_URL_PATTERN``.
        """
        params = {}
        for item in urlparse(href).query.split('&'):
            # partition() tolerates items without '=' (including the empty
            # query) and keeps any '=' that is part of the value.
            key, sep, value = item.partition('=')
            if sep:
                params[key] = value
        url = unquote(params.get('url', ''))
        if DOUBAN_BOOK_URL_PATTERN.match(url):
            return url
        return None

    def load_book_urls_new(self, query, log, timeout=30):
        """Run a Douban book search and return the detail-page URLs of the hits.

        At most ``self.max_workers`` URLs are returned. Network failures are
        logged and produce an empty list.
        """
        params = {"cat": DOUBAN_BOOK_CAT, "q": query}
        url = DOUBAN_SEARCH_URL + "?" + urlencode(params)
        log.info(f'Load books by search url: {url}')
        book_urls = []
        try:
            # The context manager closes the connection even when reading fails.
            with urlopen(Request(url, headers=self.get_headers(), method='GET'), timeout=timeout) as res:
                if res.status not in [200, 201]:
                    return book_urls
                html_content = self.get_res_content(res)
        except OSError as e:  # URLError, HTTPError and socket timeouts are all OSError
            log.error(f'Failed to load search url {url}: {e}')
            return book_urls
        if self.is_prohibited(html_content, log):
            return book_urls
        html = BeautifulSoup(html_content)
        for link in html.select('a.nbg'):
            if len(book_urls) >= self.max_workers:
                break  # enough hits collected, no need to inspect the rest
            parsed = self.calc_url(link.get('href', ''))
            if parsed:
                book_urls.append(parsed)
        return book_urls

    def search_books(self, query, log, timeout=30):
        """Search Douban for *query* and return the parsed books that have a title.

        Detail pages are loaded concurrently on the thread pool; a page that
        fails is logged and skipped so the other results are still returned.
        """
        book_urls = self.load_book_urls_new(query, log, timeout)
        futures = [self.thread_pool.submit(self.load_book, book_url, log, timeout) for book_url in book_urls]
        books = []
        for future in as_completed(futures):
            try:
                book = future.result()
            except Exception as e:
                log.exception('Failed to load book:', e)
                continue
            if self.is_valid_book(book):
                books.append(book)
        return books

    def load_book(self, url, log, timeout=30):
        """Download the detail page at *url* and parse it.

        Returns the dict produced by the parser, or ``None`` when the download
        fails, access is prohibited, or parsing raises.
        """
        start_time = time.time()
        if self.douban_delay_enable:
            self.random_sleep(log)
        try:
            with urlopen(Request(url, headers=self.get_headers(), method='GET'), timeout=timeout) as res:
                if res.status not in [200, 201]:
                    return None
                book_detail_content = self.get_res_content(res)
        except OSError as e:  # URLError, HTTPError and socket timeouts are all OSError
            log.error(f'Failed to download {url}: {e}')
            return None
        if self.is_prohibited(book_detail_content, log):
            return None
        log.info("Downloaded:{} Successful,Time {:.0f}ms".format(url, (time.time() - start_time) * 1000))
        book = None
        try:
            book = self.book_parser.parse_book(url, book_detail_content)
            if not self.is_valid_book(book):
                log.info(f"Parse book content error: {book_detail_content}")
        except Exception as e:
            log.info(f"Parse book content error: {e} \n Content: {book_detail_content}")
        return book

    def is_valid_book(self, book):
        """Return True when *book* is a dict with a non-empty title."""
        return book is not None and bool(book.get('title', None))

    def is_prohibited(self, html_content, log):
        """Return True (and log the page content block) when the page says access is forbidden."""
        prohibited = html_content is not None and '禁止访问' in html_content
        if prohibited:
            html = BeautifulSoup(html_content)
            html_content = html.select_one('div#content')
            log.info(f'Douban网页访问失败:{html_content}')
        return prohibited

    def get_res_content(self, res):
        """Read *res*, undo its Content-Encoding and return the decoded text."""
        import zlib  # local import: only needed for deflate responses

        encoding = res.info().get('Content-Encoding')
        res_content = res.read()
        if encoding == 'gzip':
            res_content = gzip.decompress(res_content)
        elif encoding == 'deflate':
            # get_headers() advertises deflate, so it has to be decoded as well.
            try:
                res_content = zlib.decompress(res_content)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                res_content = zlib.decompress(res_content, -zlib.MAX_WBITS)
        # get_content_charset() returns None when the server declares no charset.
        return res_content.decode(res.headers.get_content_charset() or 'utf-8')

    def get_headers(self):
        """Build the request headers: random User-Agent, accepted encodings, optional Cookie."""
        headers = {'User-Agent': random_user_agent(), 'Accept-Encoding': 'gzip, deflate'}
        if self.douban_login_cookie:
            headers['Cookie'] = self.douban_login_cookie
        return headers

    def random_sleep(self, log):
        """Sleep for a random time below 0.1 second and log the chosen duration."""
        random_sec = random.random() / 10
        log.info("Random sleep time {}s".format(random_sec))
        time.sleep(random_sec)


class DoubanBookHtmlParser:
    """Extract book fields from the HTML of a Douban book detail page."""

    def __init__(self):
        self.id_pattern = DOUBAN_BOOK_URL_PATTERN
        self.tag_pattern = re.compile("criteria = '(.+)'")

    def parse_book(self, url, book_content):
        """Parse *book_content* (HTML text) into a dict describing the book.

        The dict always has ``title``, ``url``, ``rating``, ``authors``,
        ``translators``, ``publisher``, ``description``, ``tags``, ``source``
        and ``language``; ``id``, ``cover``, ``publishedDate``, ``isbn`` and
        ``series`` are present only when found on the page.
        """
        book = {}
        html = BeautifulSoup(book_content)
        if html is None or html.select is None:  # guard against an empty parse result
            return None
        title_element = html.select("span[property='v:itemreviewed']")
        book['title'] = self.get_text(title_element)
        share_element = html.select("a[data-url]")
        if len(share_element):
            # Prefer the URL the page itself declares over the one requested.
            url = share_element[0].get('data-url')
        book['url'] = url
        id_match = self.id_pattern.match(url)
        if id_match:
            book['id'] = id_match.group(1)
        img_element = html.select("a.nbg")
        if len(img_element):
            cover = img_element[0].get('href', '')
            if not cover or cover.endswith('update_image'):
                book['cover'] = ''
            else:
                book['cover'] = cover
        rating_element = html.select("strong[property='v:average']")
        book['rating'] = self.get_rating(rating_element)
        elements = html.select("span.pl")
        book['authors'] = []
        book['translators'] = []
        book['publisher'] = ''
        for element in elements:
            # Each span.pl is a field label; the value follows it or sits
            # in the label's parent element.
            text = self.get_text(element)
            parent_ele = element.find_parent()
            if text.startswith("作者"):
                book['authors'].extend([self.get_text(author_element) for author_element in
                                        filter(self.author_filter, parent_ele.select("a"))])
            elif text.startswith("译者"):
                book['translators'].extend([self.get_text(translator_element) for translator_element in
                                            filter(self.author_filter, parent_ele.select("a"))])
            elif text.startswith("出版社"):
                book['publisher'] = self.get_tail(element)
            elif text.startswith("副标题"):
                book['title'] = book['title'] + ':' + self.get_tail(element)
            elif text.startswith("出版年"):
                book['publishedDate'] = self.get_tail(element)
            elif text.startswith("ISBN"):
                book['isbn'] = self.get_tail(element)
            elif text.startswith("丛书"):
                book['series'] = self.get_text(element.find_next_sibling())
        summary_element = html.select("div#link-report div.intro")
        book['description'] = ''
        if len(summary_element):
            book['description'] = str(summary_element[-1])
        book['tags'] = self.get_tags(book_content)
        book['source'] = {
            "id": PROVIDER_ID,
            "description": PROVIDER_NAME,
            "link": DOUBAN_BOOK_BASE
        }
        book['language'] = self.get_book_language(book['title'])
        return book

    def get_book_language(self, title):
        """Return 'en_US' when the title contains '英文版' or is only ASCII letters,
        hyphens and underscores; otherwise 'zh_CN'."""
        pattern = r'^[a-zA-Z\-_]+$'
        if title and ('英文版' in title or bool(re.match(pattern, title))):
            return 'en_US'
        return 'zh_CN'

    def get_tags(self, book_content):
        """Return the tags found in the page's ``criteria = '...'`` string.

        The string is split on '|' and only entries prefixed with '7:' are
        kept, with that prefix removed.
        """
        tag_match = self.tag_pattern.findall(book_content)
        if len(tag_match):
            return [tag.replace('7:', '') for tag in
                    filter(lambda tag: tag and tag.startswith('7:'), tag_match[0].split('|'))]
        return []

    def get_rating(self, rating_element):
        """Return half of the numeric rating text, or 0.0 when it is missing or not a number."""
        try:
            return float(self.get_text(rating_element, '0')) / 2
        except ValueError:
            # A non-numeric rating must not discard the rest of the parsed book.
            return 0.0

    def author_filter(self, a_element):
        """Return True for links whose href points to an author or search page."""
        a_href = a_element.get('href', '')
        return '/author' in a_href or '/search' in a_href

    def get_text(self, element, default_str=''):
        """Return the stripped text of a tag, or of the first tag in a result list.

        *default_str* is returned for ``None``, an empty list, or empty text.
        """
        text = default_str
        if isinstance(element, Tag):
            text = element.get_text(strip=True)
        elif element and isinstance(element[0], Tag):  # "element and" also covers None
            text = element[0].get_text(strip=True)
        return text if text else default_str

    def get_tail(self, element, default_str=''):
        """Return the value that follows a label tag.

        String siblings are concatenated until the first tag sibling; if no
        text was collected by then, that tag's own text is used.
        """
        text = default_str
        if isinstance(element, Tag) and element.next_siblings:
            for next_sibling in element.next_siblings:
                if isinstance(next_sibling, str):
                    text += next_sibling.strip()
                elif isinstance(next_sibling, Tag):
                    if not text:
                        text = self.get_text(next_sibling, default_str)
                    break
        return text if text else default_str


class NewDoubanBooks(Source):
    """Calibre metadata source plugin that scrapes book.douban.com."""

    name = 'New Douban Books'  # Name of the plugin
    description = 'Downloads metadata and covers from Douban Books web site.'
    supported_platforms = ['windows', 'osx', 'linux']  # Platforms this plugin will run on
    author = PROVIDER_AUTHOR  # The author of this plugin
    version = PROVIDER_VERSION  # The version number of this plugin
    minimum_calibre_version = (5, 0, 0)
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'rating', 'identifier:' + PROVIDER_ID
    ])  # language currently disabled
    book_searcher = None
    options = (
        # name, type, default, label, description, choices
        # type 'number', 'string', 'bool', 'choices'
        Option(
            'douban_concurrency_size', 'number', DOUBAN_CONCURRENCY_SIZE,
            _('Douban concurrency size:'),
            _('The number of douban concurrency cannot be too high!')
        ),
        Option(
            'add_translator_to_author', 'bool', True,
            _('Add translator to author'),
            _('If selected, translator will be written to metadata as author')
        ),
        Option(
            'douban_delay_enable', 'bool', True,
            _('douban random delay'),
            _('Random delay for a period of time before request')
        ),
        Option(
            'douban_search_with_author', 'bool', True,
            _('search with authors'),
            _('add authors to search keywords')
        ),
        Option(
            'douban_login_cookie', 'string', None,
            _('douban login cookie'),
            _('Browser cookie after login')
        ),
    )

    def __init__(self, *args, **kwargs):
        Source.__init__(self, *args, **kwargs)
        # ThreadPoolExecutor rejects max_workers <= 0, so fall back to the
        # default for an unset preference and clamp to at least one worker.
        concurrency_size = max(1, int(self.prefs.get('douban_concurrency_size') or DOUBAN_CONCURRENCY_SIZE))
        douban_delay_enable = bool(self.prefs.get('douban_delay_enable'))
        douban_login_cookie = self.prefs.get('douban_login_cookie')
        self.douban_search_with_author = bool(self.prefs.get('douban_search_with_author'))
        self.book_searcher = DoubanBookSearcher(concurrency_size, douban_delay_enable, douban_login_cookie)

    def get_book_url(self, identifiers):
        """Return ``(PROVIDER_ID, douban_id, url)`` for the Douban id in *identifiers*.

        The id is read from the ``new_douban`` key, falling back to ``douban``.
        Returns ``None`` when neither is present.
        """
        douban_id = identifiers.get(PROVIDER_ID, None)
        if douban_id is None:
            douban_id = identifiers.get('douban', None)
        if douban_id is not None:
            return PROVIDER_ID, douban_id, DOUBAN_BOOK_URL % douban_id
        return None

    def download_cover(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        """Download the cover and put ``(self, data)`` on *result_queue*.

        When no cover URL is cached for *identifiers*, ``identify`` is run
        first and the cached URL of the best-ranked result is used.
        NOTE(review): ``identifiers={}`` is a shared mutable default; it is
        never mutated here and is presumably kept to mirror calibre's base
        ``Source`` signature - confirm before changing.
        """
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(
                log,
                rq,
                abort,
                title=title,
                authors=authors,
                identifiers=identifiers,
                timeout=timeout
            )
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(
                key=self.identify_results_keygen(
                    title=title, authors=authors, identifiers=identifiers
                )
            )
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            if self.book_searcher.douban_login_cookie:
                br = br.clone_browser()
                br.set_current_header('Cookie', self.book_searcher.douban_login_cookie)
            br.set_current_header('Referer', DOUBAN_BOOK_BASE)
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except Exception:
            log.exception('Failed to download cover from:', cached_url)

    def get_cached_cover_url(self, identifiers):
        """Return the cached cover URL for the Douban id (or ISBN) in *identifiers*, else None."""
        url = None
        db = identifiers.get(PROVIDER_ID, None)
        if db is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,
            identifiers={},
            timeout=30):
        """Look the book up on Douban and put a ``Metadata`` per match on *result_queue*.

        With a Douban id in *identifiers* that exact page is loaded. Otherwise
        Douban is searched by ISBN, or by title (plus authors when enabled),
        and retried with the title alone if that finds nothing.
        """
        add_translator_to_author = self.prefs.get(
            'add_translator_to_author')

        isbn = check_isbn(identifiers.get('isbn', None))
        new_douban = self.get_book_url(identifiers)
        if new_douban:
            # A Douban id is known: load that exact book page directly.
            log.info(f'Load book by {PROVIDER_ID}:{new_douban[1]}')
            book = self.book_searcher.load_book(new_douban[2], log, timeout)
            books = []
            if self.book_searcher.is_valid_book(book):
                books.append(book)
        else:
            search_keyword = title
            if self.douban_search_with_author and title and authors:
                authors_str = ','.join(authors)
                search_keyword = f'{title} {authors_str}'
            query = isbn or search_keyword
            if not query:
                # Without this guard the literal string "None" would be searched.
                log.info('No ISBN or title available, skip Douban search')
                return
            books = self.book_searcher.search_books(query, log, timeout)
            if not len(books) and title and (isbn or search_keyword != title):
                # Nothing found by ISBN or title+authors: retry with the title alone.
                books = self.book_searcher.search_books(title, log, timeout)
        for book in books:
            if abort.is_set():
                break
            ans = self.to_metadata(book, add_translator_to_author, log)
            if isinstance(ans, Metadata):
                db = ans.identifiers[PROVIDER_ID]
                if ans.isbn:
                    self.cache_isbn_to_identifier(ans.isbn, db)
                if ans.cover:
                    self.cache_identifier_to_cover_url(db, ans.cover)
                self.clean_downloaded_metadata(ans)
                result_queue.put(ans)

    def to_metadata(self, book, add_translator_to_author, log):
        """Convert a parsed book dict into a calibre ``Metadata`` object.

        Returns ``None`` when *book* is empty or carries no Douban id.
        """
        if not book:
            return None
        douban_id = book.get('id')
        if not douban_id:
            # parse_book() sets 'id' only when the page URL matched the pattern.
            log.error('Skip book without douban id: %r' % book.get('url'))
            return None
        authors = (book['authors'] + book['translators']
                   ) if add_translator_to_author else book['authors']
        mi = Metadata(book['title'], authors)
        mi.identifiers = {PROVIDER_ID: douban_id}
        mi.url = book['url']
        mi.cover = book.get('cover', None)
        mi.publisher = book['publisher']
        pubdate = book.get('publishedDate', None)
        if pubdate:
            for pattern, date_format in _PUBDATE_FORMATS:
                if pattern.match(pubdate):
                    try:
                        mi.pubdate = datetime.strptime(pubdate, date_format)
                    except ValueError:
                        log.error('Failed to parse pubdate %r' % pubdate)
                    break
        mi.comments = book['description']
        mi.tags = book.get('tags', [])
        mi.rating = book['rating']
        mi.isbn = book.get('isbn', '')
        # A series is a single name, so use None (not a list) when absent.
        mi.series = book.get('series') or None
        mi.language = book.get('language', 'zh_CN')
        log.info('parsed book', book)
        return mi


if __name__ == "__main__":
    # To run these test use: calibre-debug -e ./__init__.py
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test
    )

    test_identify_plugin(
        NewDoubanBooks.name, [
            ({
                 'identifiers': {
                     'isbn': '9787111544937'
                 },
                 'title': '深入理解计算机系统(原书第3版)'
             }, [title_test('深入理解计算机系统(原书第3版)', exact=True),
                 authors_test(['randal e.bryant', "david o'hallaron", '贺莲', '龚奕利'])]),
            ({
                 'title': '凤凰架构'
             }, [title_test('凤凰架构:构建可靠的大型分布式系统', exact=True),
                 authors_test(['周志明'])])
        ]
    )