├── _config.yml ├── screenshots ├── 01.png ├── 02.png ├── 03.png ├── 04.png └── 05.png ├── tests ├── __init__.py ├── test_zcool.py └── test_multi_thread.py ├── requirements.txt ├── zcool.py ├── cnu.py ├── scraper ├── __init__.py ├── utils.py ├── cnu.py └── zcool.py ├── Pipfile ├── LICENSE ├── .gitignore ├── README.md └── Pipfile.lock /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /screenshots/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/01.png -------------------------------------------------------------------------------- /screenshots/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/02.png -------------------------------------------------------------------------------- /screenshots/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/03.png -------------------------------------------------------------------------------- /screenshots/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/04.png -------------------------------------------------------------------------------- /screenshots/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/05.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : __init__.py 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 8:46 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles>=0.6.0 2 | beautifulsoup4>=4.9.1 3 | click>=7.0 4 | requests>=2.24.0 5 | ruia[uvloop]>=0.8.0 6 | termcolor>=1.1.0 7 | typer>=0.3.2 8 | -------------------------------------------------------------------------------- /zcool.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : zcool 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:19 4 | import sys 5 | 6 | from scraper import zcool_command 7 | 8 | if __name__ == '__main__': 9 | sys.exit(zcool_command()) 10 | -------------------------------------------------------------------------------- /cnu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # @Author: eilianxiao 3 | # @Date: Dec 27 03:59 2020 4 | import typer 5 | 6 | from scraper.cnu import cnu_command 7 | 8 | if __name__ == '__main__': 9 | typer.run(cnu_command) 10 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : __init__.py 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:04 4 | from .zcool import ZCoolScraper, zcool_command 5 | 6 | __author__ = 'lonsty' 7 | __email__ = 'lonsty@sina.com' 8 | __version__ = '0.1.4' 9 | 10 | __all__ = [ 11 | 'ZCoolScraper', 12 | 
'zcool_command' 13 | ] 14 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | aiofiles = ">=0.6.0" 10 | beautifulsoup4 = ">=4.9.1" 11 | click = ">=7.0" 12 | requests = ">=2.24.0" 13 | ruia = {version = ">=0.8.0", extras = ["uvloop"]} 14 | termcolor = ">=1.1.0" 15 | typer = ">=0.3.2" 16 | 17 | [requires] 18 | python_version = "3.6" 19 | -------------------------------------------------------------------------------- /tests/test_zcool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Tests for `zcooldl` package.""" 3 | import unittest 4 | 5 | from click.testing import CliRunner 6 | 7 | from scraper.zcool import zcool_command 8 | 9 | 10 | class TestZcooldl(unittest.TestCase): 11 | """Tests for `zcooldl` package.""" 12 | 13 | def setUp(self): 14 | """Set up test fixtures, if any.""" 15 | 16 | def tearDown(self): 17 | """Tear down test fixtures, if any.""" 18 | 19 | def test_000_something(self): 20 | """Test something.""" 21 | 22 | def test_command_line_interface(self): 23 | """Test the CLI.""" 24 | runner = CliRunner() 25 | result = runner.invoke(zcool_command) 26 | assert result.exit_code == 0 27 | assert 'Try "python zcool.py --help" for help.' in result.output 28 | help_result = runner.invoke(zcool_command, ['--help']) 29 | assert help_result.exit_code == 0 30 | assert 'Show this message and exit.' in help_result.output 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Allen Shaw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea 107 | www.zcool.com.cn/ 108 | www.cnu.cc/ 109 | -------------------------------------------------------------------------------- /scraper/utils.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : utils 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:09 4 | import os 5 | import random 6 | import time 7 | from collections import namedtuple 8 | from functools import wraps 9 | from typing import Iterable 10 | 11 | 12 | def retry(exceptions, tries=3, delay=1, backoff=2, logger=None): 13 | """Retry calling the decorated function using an exponential backoff. 14 | 15 | :param exceptions: The exception to check. may be a tuple of exceptions to check. 16 | :param tries: Number of times to try (not retry) before giving up. 17 | :param delay: Initial delay between retries in seconds. 18 | :param backoff: Backoff multiplier (e.g. value of 2 will double the delay each retry). 19 | :param logger: Logger to use. If None, print. 
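    Illustrative usage (a minimal sketch; ``fetch_page`` and its URL argument are
    hypothetical names, not part of this package)::

        @retry(requests.RequestException, tries=3, delay=1, backoff=2)
        def fetch_page(url):
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            return resp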
20 | """ 21 | 22 | def deco_retry(f): 23 | 24 | @wraps(f) 25 | def f_retry(*args, **kwargs): 26 | mtries, mdelay = tries, delay or random.uniform(0.5, 1.5) 27 | while mtries > 1: 28 | try: 29 | return f(*args, **kwargs) 30 | except exceptions as e: 31 | if logger: 32 | logger.warning('{}, Retrying in {} seconds...'.format(e, mdelay)) 33 | # else: 34 | # print('{}, Retrying in {} seconds...'.format(e, mdelay)) 35 | time.sleep(mdelay) 36 | mtries -= 1 37 | mdelay *= backoff 38 | return f(*args, **kwargs) 39 | 40 | return f_retry # true decorator 41 | 42 | return deco_retry 43 | 44 | 45 | def mkdirs_if_not_exist(dir): 46 | """文件夹不存在时则创建。 47 | 48 | :param str dir: 文件夹路径,支持多级 49 | """ 50 | if not os.path.isdir(dir): 51 | try: 52 | os.makedirs(dir) 53 | return True 54 | except FileExistsError: 55 | pass 56 | 57 | 58 | def safe_filename(filename): 59 | """去掉文件名中的非法字符。 60 | 61 | :param str filename: 文件名 62 | :return str: 合法文件名 63 | """ 64 | return "".join([c for c in filename if c not in r'\/:*?"<>|']).strip() 65 | 66 | 67 | def parse_resources(ids, names, collections): 68 | """解析用户名或 ID。 69 | 70 | :param str ids: 半角逗号分隔的用户 ID 71 | :param str names: 半角逗号分隔的用户名 72 | :return list: 包含 User 数据的列表 73 | """ 74 | Resource = namedtuple('Resource', 'id name collection') 75 | resources = [] 76 | if collections: 77 | resources = [Resource(None, None, collection) for collection in collections.split(',')] 78 | elif names: 79 | resources = [Resource(None, name, None) for name in names.split(',')] 80 | elif ids: 81 | resources = [Resource(uid, None, None) for uid in ids.split(',')] 82 | return resources # TODO: 去重 83 | 84 | 85 | def sort_records(records: Iterable, order: dict): 86 | """根据自定义的排序规则排序 87 | 88 | :param Iterable records: 要排序的记录 89 | :param dict order: 自定义的排序 90 | :return: 91 | """ 92 | 93 | def _order_by(obj: namedtuple): 94 | if obj.type == 'topic': 95 | return (order[obj.type], obj.index, obj.objid, obj.title, obj.url) 96 | return (order[obj.type], obj.objid, obj.index, obj.title, obj.url) 97 | 98 | return sorted(records, key=_order_by) 99 | -------------------------------------------------------------------------------- /tests/test_multi_thread.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : test_requests 2 | # @AUTHOR : lonsty 3 | # @DATE : 2020/12/27 14:44 4 | import os 5 | import threading 6 | from concurrent.futures import ThreadPoolExecutor, wait 7 | from pathlib import Path 8 | import traceback 9 | 10 | import requests 11 | 12 | # http://www.cnu.cc/works/427334 13 | # urls = [ 14 | # 'http://imgoss.cnu.cc/2010/30/994dfe50c509344eb3f5b525df642d60.jpg', 15 | # 'http://imgoss.cnu.cc/2010/30/4e695801d1c83c559991017cb8e2ff7a.jpg', 16 | # 'http://imgoss.cnu.cc/2010/30/07c3089cf6383bf596c59e92dc105412.jpg', 17 | # 'http://imgoss.cnu.cc/2010/30/a5b738e0f21737bb953e7407c8425772.jpg', 18 | # 'http://imgoss.cnu.cc/2010/30/24dd6e27c9cf38eab7daa72cc3e182e3.jpg', 19 | # 'http://imgoss.cnu.cc/2010/30/624cbd2e69313174b6ad13a9d2e75279.jpg', 20 | # 'http://imgoss.cnu.cc/2010/30/9786475b74733c91ba7f8b638468e299.jpg', 21 | # 'http://imgoss.cnu.cc/2010/30/75e085d6573c326dab3a8959736cb355.jpg', 22 | # 'http://imgoss.cnu.cc/2010/30/d28d0a7729353414a58e8fa10ff11fe8.jpg', 23 | # 'http://imgoss.cnu.cc/2010/30/669be1298a3e3546974977ed7c9655eb.jpg', 24 | # 'http://imgoss.cnu.cc/2010/30/ee82358fd6a537108d938ebba7cfe7de.jpg', 25 | # 'http://imgoss.cnu.cc/2010/30/007ce903478e3f6aa867a5789d213748.jpg', 26 | # 
'http://imgoss.cnu.cc/2010/30/49549ca620873ba5bf71dd8c92d9e006.jpg', 27 | # 'http://imgoss.cnu.cc/2010/30/11426afdb0453d1b86bc7a2bb187bcef.jpg', 28 | # 'http://imgoss.cnu.cc/2010/30/39b4116ba53731f3994acddef431532c.jpg', 29 | # 'http://imgoss.cnu.cc/2010/30/7a7b6ae21eb13701a8f238aaedcf6d7d.jpg', 30 | # 'http://imgoss.cnu.cc/2010/30/e9ed54e368873d44b6b8fc276f0a6018.jpg', 31 | # 'http://imgoss.cnu.cc/2010/30/5f97c9f5ff6f353f814be1f9867ba6d0.jpg', 32 | # 'http://imgoss.cnu.cc/2010/30/42fb0868ca8434ec8c5f3460b96b2b5b.jpg', 33 | # 'http://imgoss.cnu.cc/2010/30/584e84b9e0253c80ac33f22a33adb9df.jpg' 34 | # ] 35 | 36 | # http://www.cnu.cc/works/435640 37 | urls = [ 38 | 'http://imgoss.cnu.cc/2012/25/pv6kqgjspuf4e9mefu01608863905044.jpg', 39 | 'http://imgoss.cnu.cc/2012/25/tpl70xav4fk8zsu55881608863905046.jpg', 40 | 'http://imgoss.cnu.cc/2012/25/9mkp309it3ub5qv9f6f1608863905046.jpg', 41 | 'http://imgoss.cnu.cc/2012/25/zb7f9qlpu75x2s53i821608863905047.jpg', 42 | 'http://imgoss.cnu.cc/2012/25/r2kxv11qltnruneqlpk1608863905047.jpg', 43 | 'http://imgoss.cnu.cc/2012/25/oqr644pxdcxeb404n1e1608863905048.jpg', 44 | 'http://imgoss.cnu.cc/2012/25/vjgt0am668sus1kvvcj1608863905048.jpg', 45 | 'http://imgoss.cnu.cc/2012/25/wjchu6v3en8iin2x3qf1608863905049.jpg', 46 | 'http://imgoss.cnu.cc/2012/25/jnq983zvv9k6iatdofo1608863905049.jpg' 47 | ] 48 | thread_local = threading.local() 49 | dest = Path('www.cnu.cc/冬日暖阳') 50 | 51 | 52 | def mkdirs_if_not_exist(dir): 53 | """文件夹不存在时则创建。 54 | 55 | :param str dir: 文件夹路径,支持多级 56 | """ 57 | if not os.path.isdir(dir): 58 | try: 59 | os.makedirs(dir) 60 | return True 61 | except FileExistsError: 62 | pass 63 | 64 | 65 | def get_session(): 66 | """使线程获取同一个 Session,可减少 TCP 连接数,加速请求。 67 | 68 | :return requests.Session: session 69 | """ 70 | if not hasattr(thread_local, "session"): 71 | thread_local.session = requests.Session() 72 | return thread_local.session 73 | 74 | 75 | def download_image(url): 76 | print(f'Downloading {url} ...') 77 | session = get_session() 78 | try: 79 | response = session.get(url, timeout=20) 80 | except Exception: 81 | print(traceback.format_exc()) 82 | return 83 | filepath = dest / url.split("/")[-1] 84 | with open(filepath, 'wb') as f: 85 | for chunk in response.iter_content(8192): 86 | f.write(chunk) 87 | print(f'Saved to {filepath}') 88 | 89 | 90 | if __name__ == '__main__': 91 | print('Start ...') 92 | os.makedirs(dest, exist_ok=True) 93 | with ThreadPoolExecutor(max_workers=10) as pool: 94 | futures = [pool.submit(download_image, url) for url in urls] 95 | wait(futures) 96 | print('Done.') 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraper 2 | 3 | 图片爬取下载工具,极速爬取下载 站酷([https://www.zcool.com.cn/](https://www.zcool.com.cn/))、CNU 视觉([http://www.cnu.cc/](http://www.cnu.cc/))`设计师/用户` 上传的 `图片/照片/插画`。 4 | 5 | **:tada: :tada: :tada: 站酷下载工具已发布到 PyPI** 6 | 7 | - 项目地址:[https://github.com/lonsty/zcooldl](https://github.com/lonsty/zcooldl) 8 | - 在线文档:[https://zcooldl.readthedocs.io/](https://zcooldl.readthedocs.io/) 9 | - 快速安装:`pip install -U zcooldl` 10 | - 使用方式:`zcooldl -u ` 11 | 12 | `scraper` 本来是规划用来存放各式各样的爬虫程序的。站酷仅仅是当初构想中的一个,因为太懒而没有新增其他爬虫。 13 | 想不到 [zcool.py](scraper/zcool.py) 竟然从原来的几十行代码,逐步增加到现在的 500+ 行 :joy: :joy: :joy:。 14 | 15 | 16 | ## 支持网站: 17 | 18 | | 网站 | 入口 | 示例 | 19 | |:---:|:---:|:---:| 20 | | [Zcool 站酷](https://www.zcool.com.cn/) | [zcool.py](zcool.py) | `python zcool.py -u 叁乔居` | 21 | | [CNU 
视觉](http://www.cnu.cc/) | [cnu.py](cnu.py) | `python cnu.py http://www.cnu.cc/users/142231` | 22 | 23 | 24 | #### Zcool 站酷 25 | 26 | - [x] 极速下载:多线程异步下载,可以根据需要设置线程数 27 | - [x] 超清原图:默认下载超清原图(约几 MB),使用参数 `--thumbnail` 下载缩略图(宽最大 1280px,约 500KB) 28 | - [x] 下载收藏夹 `New`:使用 `-c <收藏夹 URL, ...>` 下载收藏夹中的作品(收藏夹可自由创建) 29 | 30 | #### CNU 视觉 31 | 32 | - [x] 下载 [视觉(CNU)](http://www.cnu.cc/) 作品 `New`:试用异步爬虫框架 [ruia](https://github.com/howie6879/ruia) 33 | - [x] 支持 3 类 URL 参数: 34 | 35 | - 作品集:[http://www.cnu.cc/works/117783](http://www.cnu.cc/works/117783) 36 | - 用户作品页:[http://www.cnu.cc/users/142231](http://www.cnu.cc/users/652629) 37 | - 用户推荐页:[http://www.cnu.cc/users/recommended/142231](http://www.cnu.cc/users/recommended/652629) 38 | 39 | 参数可接收多个 URL,且可随意组合,如: 40 | 41 | ```sh 42 | $ python cnu.py http://www.cnu.cc/works/117783 http://www.cnu.cc/users/652629 http://www.cnu.cc/users/recommended/652629 43 | ``` 44 | 45 | ### 环境: 46 | 47 | - `python3.6` 及以上 48 | 49 | # 快速使用 50 | 51 | 首先克隆项目到本地,并安装依赖: 52 | 53 | ```sh 54 | $ git clone https://github.com/lonsty/scraper.git 55 | 56 | $ cd scraper 57 | $ pip install -r requirements.txt 58 | ``` 59 | 60 | 1. 下载 [站酷(Zcool)](https://www.zcool.com.cn/)作品 61 | 62 | 下载用户名为 `username` 的所有图片到路径 `path` 下: 63 | 64 | ```sh 65 | $ python zcool.py -u -d 66 | ``` 67 | 68 | 运行截图 69 | 70 | ![screenshot_04.png](screenshots/04.png) 71 | 72 | ![screenshot_03.png](screenshots/03.png) 73 | 74 | ![screenshot_05.png](screenshots/05.png) 75 | 76 | 爬取结果 77 | 78 | ![screenshot_02.png](screenshots/02.png) 79 | 80 | 2. 下载 [视觉(CNU)](http://www.cnu.cc/)作品 81 | 82 | ```sh 83 | python cnu.py ... 84 | ``` 85 | 86 | # 使用帮助 87 | 88 | ### 常用命令 89 | 90 | 1. 只下载用户的**部分主题** 91 | 92 | ```sh 93 | $ python zcool.py -u -t ,,... 94 | ``` 95 | 96 | 2. 一次性下载**多个用户**的所有图片 97 | 98 | ```sh 99 | $ python zcool.py -u ,,... 100 | ``` 101 | 102 | 3. 部分图片**下载失败**或有**更新**,再执行相同的命令,对失败或新增的图片进行下载 103 | 104 | ```sh 105 | $ python zcool.py -u -d 106 | ``` 107 | 108 | ### 查看所有命令 109 | 110 | ```sh 111 | # Zcool 站酷 112 | $ python zcool.py --help 113 | 114 | Usage: zcool.py [OPTIONS] 115 | 116 | ZCool picture crawler, download pictures, photos and illustrations of 117 | ZCool (https://zcool.com.cn/). Visit https://github.com/lonsty/scraper. 118 | 119 | Options: 120 | -u, --usernames TEXT One or more user names, separated by commas. 121 | -i, --ids TEXT One or more user IDs, separated by commas. 122 | -c, --collections TEXT One or more collection URLs, separated by commas. 123 | -t, --topics TEXT Specific topics to download, separated by commas. 124 | -d, --destination TEXT Destination to save images. 125 | -R, --retries INTEGER Repeat download for failed images. [default: 3] 126 | -r, --redownload TEXT Redownload images from failed records (PATH of the 127 | .json file). 128 | -o, --overwrite Override the existing files. 129 | --thumbnail Download thumbnails with a maximum width of 1280px. 130 | --max-pages INTEGER Maximum pages to download. 131 | --max-topics INTEGER Maximum topics per page to download. 132 | --max-workers INTEGER Maximum thread workers. [default: 20] 133 | --help Show this message and exit. 134 | 135 | # CNU 视觉 136 | $ python cnu.py --help 137 | Usage: cnu.py [OPTIONS] START_URLS... 138 | 139 | A scraper to download images from http://www.cnu.cc/ 140 | 141 | Arguments: 142 | START_URLS... URLs of the works [required] 143 | 144 | Options: 145 | -d, --destination PATH Destination directory to save the images 146 | [default: .] 
147 | 148 | -o, --overwrite / -no, --no-overwrite 149 | Whether to overwrite existing images 150 | [default: False] 151 | 152 | -t, --thumbnail Whether to download the thumbnail images 153 | [default: False] 154 | 155 | -r, --retries INTEGER Number of retries when the download fails 156 | [default: 3] 157 | 158 | -w, --workers INTEGER Number of parallel workers [default: 2] 159 | -c, --concurrency INTEGER Number of concurrency [default: 25] 160 | --delay INTEGER Seconds to wait for the next request 161 | [default: 0] 162 | 163 | --retry-delay INTEGER Seconds to wait for the retry request 164 | [default: 0] 165 | 166 | --timeout INTEGER Seconds of HTTP request timeout [default: 167 | 20] 168 | 169 | --install-completion [bash|zsh|fish|powershell|pwsh] 170 | Install completion for the specified shell. 171 | --show-completion [bash|zsh|fish|powershell|pwsh] 172 | Show completion for the specified shell, to 173 | copy it or customize the installation. 174 | 175 | --help Show this message and exit. 176 | ``` 177 | 178 | # 更新历史 179 | 180 | - ## 0.1.5 (2020-12-27) 181 | 182 | - 新增爬虫:使用 ruia 异步爬虫框架下载 [视觉(CNU)](http://www.cnu.cc/)作品 183 | 184 | - ## 0.1.4 (2020-11-30) 185 | 186 | - 新增功能:新参数 `-c <收藏夹 URL, ...>`,支持下载收藏夹中的作品。 187 | 188 | - ## 0.1.3 (2020-07-22) 189 | 190 | - 修复了在动态加载页面中无法获取并下载所有图片的问题 191 | - 保存的图片文件名中加入了序号,以保持原始顺序 192 | - 添加了注释,并对代码细节做了调整 193 | 194 | - ## 2020.03.25 195 | 196 | - 优化了终端输出信息,用不同颜色文字进行了标识 197 | - 修复了在低网速下无法下载图片的问题,并加快了整体下载速度 198 | 199 | - ## 0.1.2 (2020-03-24) 200 | 201 | 新功能: 202 | 203 | - 新增下载超清原图(默认选项,约几 MB),使用参数 `--thumbnail` 下载缩略图(宽最大 1280px,约 500KB) 204 | - 新增支持下载 JPG、PNG、GIF、BMP 格式的图片 205 | 206 | - ## 0.1.1 (2019-12-09) 207 | 208 | 新功能: 209 | 210 | - 可以选择下载用户的特定主题 211 | - 支持一次性输入多个用户名或 ID 212 | 213 | BUG 修复: 214 | 215 | - 修复用户如果没有上传任何图片时的下载错误 216 | 217 | - ## 0.1.0 (2019-09-09) 218 | 219 | 主要功能: 220 | 221 | - 极速下载:多线程异步下载,可以根据需要设置线程数 222 | - 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/ 223 | - 增量下载:设计师/用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯! 224 | - 支持代理:可以配置使用代理(0.1.3 版本后改为自动读取系统代理) 225 | 226 | # LICENSE 227 | 228 | 此项目使用 [MIT](LICENSE) 开源协议 229 | 230 | **注意**:使用此工具下载的所有作品,版权归原作者所有,请谨慎使用! 
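
以下为组合用法示例(仅作演示:目录、线程数等取值可按需调整,具体选项以上文 `--help` 输出为准):

```sh
# Zcool:下载指定用户的超清原图到 ./downloads,使用 30 个下载线程
$ python zcool.py -u <username> -d ./downloads --max-workers 30

# CNU:下载一个作品集,4 个 worker、并发 25、请求超时 30 秒
$ python cnu.py -d ./downloads -w 4 -c 25 --timeout 30 http://www.cnu.cc/works/117783
```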
231 | -------------------------------------------------------------------------------- /scraper/cnu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # @Author: eilianxiao 3 | # @Date: Dec 26 18:44 2020 4 | import json 5 | from pathlib import Path 6 | from typing import List 7 | 8 | import aiofiles 9 | import typer 10 | from ruia import AttrField, Item, Spider, TextField 11 | 12 | from scraper.utils import mkdirs_if_not_exist, safe_filename 13 | 14 | IMAGE_HOST = 'http://imgoss.cnu.cc/' 15 | AUTHOR_RCMDS_PREFIX = 'http://www.cnu.cc/users/recommended/' 16 | AUTHOR_WORKS_PREFIX = 'http://www.cnu.cc/users/' 17 | WORK_PREFIX = 'http://www.cnu.cc/works/' 18 | THUMBNAIL_SUFFIX = '?x-oss-process=style/content' 19 | PAGE_SUFFIX = '?page={page}' 20 | 21 | APP_NAME = 'CNU Scraper' 22 | BASE_DIR = 'www.cnu.cc' 23 | START_URLS = [ 24 | 'http://www.cnu.cc/works/{id}', # 作品集 URL 25 | 'http://www.cnu.cc/users/{id}', # 用户作品页 URL 26 | 'http://www.cnu.cc/users/recommended/{id}', # 用户推荐页 URL 27 | ] 28 | DESTINATION = Path('.') 29 | OVERWRITE = False 30 | THUMBNAIL = False 31 | WORKER_NUMBERS = 2 32 | CONCURRENCY = 25 33 | RETRIES = 3 34 | DELAY = 0 35 | RETRY_DELAY = 0 36 | TIMEOUT = 20 37 | 38 | 39 | class PageItem(Item): 40 | target_item = TextField(css_select='div.pager_box') 41 | max_page = TextField(css_select='ul>li:nth-last-child(2)', default=1) 42 | 43 | 44 | class WorkItem(Item): 45 | target_item = TextField(css_select='div.work-thumbnail') 46 | author = TextField(css_select='div.author') 47 | title = TextField(css_select='div.title') # WorkPage 中是日期 48 | work = AttrField(css_select='.thumbnail', attr='href') 49 | 50 | 51 | class ImagesItem(Item): 52 | target_item = TextField(css_select='body') 53 | author = TextField(css_select='.author-info strong') 54 | title = TextField(css_select='.work-title') 55 | imgs_json = TextField(css_select='#imgs_json') 56 | 57 | 58 | class CNUSpider(Spider): 59 | name = APP_NAME 60 | start_urls = START_URLS 61 | request_config = { 62 | 'RETRIES': RETRIES, 63 | 'DELAY': 0, 64 | 'TIMEOUT': TIMEOUT 65 | } 66 | concurrency = CONCURRENCY 67 | # aiohttp config 68 | aiohttp_kwargs = {} 69 | 70 | def __init__(self, *args, **kwargs): 71 | super().__init__(*args, **kwargs) 72 | self._destination = DESTINATION 73 | self._overwrite = OVERWRITE 74 | self._thumbnail = THUMBNAIL 75 | # 更新 Spider 及自定义的配置 76 | for k, v in kwargs.get('spider_config', {}).items(): 77 | setattr(self, k, v) 78 | 79 | async def parse(self, response): 80 | if response.url.startswith(AUTHOR_WORKS_PREFIX): 81 | async for page_item in PageItem.get_items(html=await response.text()): 82 | for page in range(1, int(page_item.max_page) + 1): 83 | page_url = f'{response.url.split("?")[0]}{PAGE_SUFFIX.format(page=page)}' 84 | yield self.request( 85 | url=page_url, 86 | metadata={ 87 | 'current_page': page, 88 | 'max_page': page_item.max_page, 89 | }, 90 | callback=self.parse_page) 91 | elif response.url.startswith(WORK_PREFIX): 92 | yield self.parse_work(response) 93 | else: 94 | self.logger.warning(f'Parser not support URL: {response.url}') 95 | 96 | async def parse_page(self, response): 97 | async for work_item in WorkItem.get_items(html=await response.text()): 98 | yield self.request( 99 | url=work_item.work, 100 | metadata={ 101 | 'current_page': response.metadata['current_page'], 102 | 'max_page': response.metadata['max_page'], 103 | 'author': work_item.author, 104 | 'title': work_item.title, 105 | 'work': work_item.work 106 | }, 107 | 
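                # Each work URL found on this listing page is scheduled as its own
                # request; parse_work (defined below) then pulls the image URLs out of it.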
callback=self.parse_work 108 | ) 109 | 110 | async def parse_work(self, response): 111 | async for images_item in ImagesItem.get_items(html=await response.text()): 112 | urls = [IMAGE_HOST + img.get('img') for img in json.loads(images_item.imgs_json)] 113 | for index, url in enumerate(urls): 114 | basename = url.split('/')[-1] 115 | save_dir = (self._destination / 116 | BASE_DIR / 117 | safe_filename(images_item.author) / 118 | safe_filename(images_item.title)) 119 | fpath = save_dir / f'[{index + 1:02d}]{basename}' 120 | if self._overwrite or not fpath.is_file(): 121 | if self._thumbnail: 122 | url += THUMBNAIL_SUFFIX 123 | self.logger.info(f'Downloading {url} ...') 124 | yield self.request( 125 | url=url, 126 | metadata={ 127 | 'title': images_item.title, 128 | 'index': index, 129 | 'url': url, 130 | 'basename': basename, 131 | 'save_dir': save_dir, 132 | 'fpath': fpath 133 | }, 134 | callback=self.save_image 135 | ) 136 | else: 137 | self.logger.info(f'Skipped already exists: {fpath}') 138 | 139 | async def save_image(self, response): 140 | # 创建图片保存目录 141 | save_dir = response.metadata['save_dir'] 142 | if mkdirs_if_not_exist(save_dir): 143 | self.logger.info(f'Created directory: {save_dir}') 144 | # 保存图片 145 | fpath = response.metadata['fpath'] 146 | try: 147 | content = await response.read() 148 | except TypeError as e: 149 | self.logger.error(e) 150 | else: 151 | async with aiofiles.open(fpath, 'wb') as f: 152 | await f.write(content) 153 | self.logger.info(f'Saved to {fpath}') 154 | 155 | 156 | def cnu_command( 157 | start_urls: List[str] = typer.Argument( 158 | ..., 159 | help='URLs of the works' 160 | ), 161 | destination: Path = typer.Option( 162 | DESTINATION, '-d', '--destination', 163 | help='Destination directory to save the images' 164 | ), 165 | overwrite: bool = typer.Option( 166 | OVERWRITE, '-o / -no', '--overwrite / --no-overwrite', 167 | help='Whether to overwrite existing images' 168 | ), 169 | thumbnail: bool = typer.Option( 170 | THUMBNAIL, '-t', '--thumbnail', 171 | help='Whether to download the thumbnail images' 172 | ), 173 | retries: int = typer.Option( 174 | RETRIES, '-r', '--retries', 175 | help='Number of retries when the download fails' 176 | ), 177 | worker_numbers: int = typer.Option( 178 | WORKER_NUMBERS, '-w', '--workers', 179 | help='Number of parallel workers' 180 | ), 181 | concurrency: int = typer.Option( 182 | CONCURRENCY, '-c', '--concurrency', 183 | help='Number of concurrency' 184 | ), 185 | delay: int = typer.Option( 186 | DELAY, '--delay', 187 | help='Seconds to wait for the next request' 188 | ), 189 | retry_delay: int = typer.Option( 190 | RETRY_DELAY, '--retry-delay', 191 | help='Seconds to wait for the retry request' 192 | ), 193 | timeout: int = typer.Option( 194 | TIMEOUT, '--timeout', 195 | help='Seconds of HTTP request timeout' 196 | ), 197 | ): 198 | """ A scraper to download images from http://www.cnu.cc/""" 199 | # 开始爬虫任务 200 | CNUSpider.start( 201 | spider_config=dict( 202 | start_urls=list(start_urls), 203 | request_config={ 204 | 'RETRIES': retries, 205 | 'DELAY': delay, 206 | 'RETRY_DELAY': retry_delay, 207 | 'TIMEOUT': timeout 208 | }, 209 | _destination=destination, 210 | _overwrite=overwrite, 211 | _thumbnail=thumbnail, 212 | worker_numbers=worker_numbers, 213 | concurrency=concurrency 214 | ) 215 | ) 216 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | 
"sha256": "9076dc5aca307d83edc51f48f924cd9fd15942e012907c64fd1164885fef70a1" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "aiofiles": { 20 | "hashes": [ 21 | "sha256:bd3019af67f83b739f8e4053c6c0512a7f545b9a8d91aaeab55e6e0f9d123c27", 22 | "sha256:e0281b157d3d5d59d803e3f4557dcc9a3dff28a4dd4829a9ff478adae50ca092" 23 | ], 24 | "index": "pypi", 25 | "version": "==0.6.0" 26 | }, 27 | "aiohttp": { 28 | "hashes": [ 29 | "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0", 30 | "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6", 31 | "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf", 32 | "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9", 33 | "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e", 34 | "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0", 35 | "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329", 36 | "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2", 37 | "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40", 38 | "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a", 39 | "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4", 40 | "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de", 41 | "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9", 42 | "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9", 43 | "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb", 44 | "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076", 45 | "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de", 46 | "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907", 47 | "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d", 48 | "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536", 49 | "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d", 50 | "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54", 51 | "sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc", 52 | "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212", 53 | "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9", 54 | "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d", 55 | "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b", 56 | "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7", 57 | "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81", 58 | "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c", 59 | "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895", 60 | "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297", 61 | "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb", 62 | "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe", 63 | "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242", 64 | "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0", 65 | 
"sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2" 66 | ], 67 | "index": "pypi", 68 | "version": "==3.7.4" 69 | }, 70 | "async-timeout": { 71 | "hashes": [ 72 | "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", 73 | "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" 74 | ], 75 | "version": "==3.0.1" 76 | }, 77 | "attrs": { 78 | "hashes": [ 79 | "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6", 80 | "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700" 81 | ], 82 | "version": "==20.3.0" 83 | }, 84 | "beautifulsoup4": { 85 | "hashes": [ 86 | "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", 87 | "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", 88 | "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" 89 | ], 90 | "index": "pypi", 91 | "version": "==4.9.3" 92 | }, 93 | "certifi": { 94 | "hashes": [ 95 | "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", 96 | "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" 97 | ], 98 | "version": "==2020.12.5" 99 | }, 100 | "chardet": { 101 | "hashes": [ 102 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 103 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 104 | ], 105 | "version": "==3.0.4" 106 | }, 107 | "click": { 108 | "hashes": [ 109 | "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a", 110 | "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc" 111 | ], 112 | "index": "pypi", 113 | "version": "==7.1.2" 114 | }, 115 | "cssselect": { 116 | "hashes": [ 117 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 118 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 119 | ], 120 | "version": "==1.1.0" 121 | }, 122 | "idna": { 123 | "hashes": [ 124 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 125 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 126 | ], 127 | "version": "==2.10" 128 | }, 129 | "idna-ssl": { 130 | "hashes": [ 131 | "sha256:a933e3bb13da54383f9e8f35dc4f9cb9eb9b3b78c6b36f311254d6d0d92c6c7c" 132 | ], 133 | "markers": "python_version < '3.7'", 134 | "version": "==1.1.0" 135 | }, 136 | "lxml": { 137 | "hashes": [ 138 | "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", 139 | "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", 140 | "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", 141 | "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", 142 | "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", 143 | "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", 144 | "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", 145 | "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", 146 | "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", 147 | "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", 148 | "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", 149 | "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", 150 | "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", 151 | 
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", 152 | "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", 153 | "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", 154 | "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", 155 | "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", 156 | "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", 157 | "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", 158 | "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", 159 | "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", 160 | "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", 161 | "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", 162 | "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", 163 | "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", 164 | "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", 165 | "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", 166 | "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", 167 | "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", 168 | "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", 169 | "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", 170 | "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", 171 | "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", 172 | "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", 173 | "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", 174 | "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" 175 | ], 176 | "version": "==4.6.2" 177 | }, 178 | "multidict": { 179 | "hashes": [ 180 | "sha256:018132dbd8688c7a69ad89c4a3f39ea2f9f33302ebe567a879da8f4ca73f0d0a", 181 | "sha256:051012ccee979b2b06be928a6150d237aec75dd6bf2d1eeeb190baf2b05abc93", 182 | "sha256:05c20b68e512166fddba59a918773ba002fdd77800cad9f55b59790030bab632", 183 | "sha256:07b42215124aedecc6083f1ce6b7e5ec5b50047afa701f3442054373a6deb656", 184 | "sha256:0e3c84e6c67eba89c2dbcee08504ba8644ab4284863452450520dad8f1e89b79", 185 | "sha256:0e929169f9c090dae0646a011c8b058e5e5fb391466016b39d21745b48817fd7", 186 | "sha256:1ab820665e67373de5802acae069a6a05567ae234ddb129f31d290fc3d1aa56d", 187 | "sha256:25b4e5f22d3a37ddf3effc0710ba692cfc792c2b9edfb9c05aefe823256e84d5", 188 | "sha256:2e68965192c4ea61fff1b81c14ff712fc7dc15d2bd120602e4a3494ea6584224", 189 | "sha256:2f1a132f1c88724674271d636e6b7351477c27722f2ed789f719f9e3545a3d26", 190 | "sha256:37e5438e1c78931df5d3c0c78ae049092877e5e9c02dd1ff5abb9cf27a5914ea", 191 | "sha256:3a041b76d13706b7fff23b9fc83117c7b8fe8d5fe9e6be45eee72b9baa75f348", 192 | "sha256:3a4f32116f8f72ecf2a29dabfb27b23ab7cdc0ba807e8459e59a93a9be9506f6", 193 | "sha256:46c73e09ad374a6d876c599f2328161bcd95e280f84d2060cf57991dec5cfe76", 194 | "sha256:46dd362c2f045095c920162e9307de5ffd0a1bfbba0a6e990b344366f55a30c1", 195 | "sha256:4b186eb7d6ae7c06eb4392411189469e6a820da81447f46c0072a41c748ab73f", 196 | "sha256:54fd1e83a184e19c598d5e70ba508196fd0bbdd676ce159feb412a4a6664f952", 197 | "sha256:585fd452dd7782130d112f7ddf3473ffdd521414674c33876187e101b588738a", 198 | 
"sha256:5cf3443199b83ed9e955f511b5b241fd3ae004e3cb81c58ec10f4fe47c7dce37", 199 | "sha256:6a4d5ce640e37b0efcc8441caeea8f43a06addace2335bd11151bc02d2ee31f9", 200 | "sha256:7df80d07818b385f3129180369079bd6934cf70469f99daaebfac89dca288359", 201 | "sha256:806068d4f86cb06af37cd65821554f98240a19ce646d3cd24e1c33587f313eb8", 202 | "sha256:830f57206cc96ed0ccf68304141fec9481a096c4d2e2831f311bde1c404401da", 203 | "sha256:929006d3c2d923788ba153ad0de8ed2e5ed39fdbe8e7be21e2f22ed06c6783d3", 204 | "sha256:9436dc58c123f07b230383083855593550c4d301d2532045a17ccf6eca505f6d", 205 | "sha256:9dd6e9b1a913d096ac95d0399bd737e00f2af1e1594a787e00f7975778c8b2bf", 206 | "sha256:ace010325c787c378afd7f7c1ac66b26313b3344628652eacd149bdd23c68841", 207 | "sha256:b47a43177a5e65b771b80db71e7be76c0ba23cc8aa73eeeb089ed5219cdbe27d", 208 | "sha256:b797515be8743b771aa868f83563f789bbd4b236659ba52243b735d80b29ed93", 209 | "sha256:b7993704f1a4b204e71debe6095150d43b2ee6150fa4f44d6d966ec356a8d61f", 210 | "sha256:d5c65bdf4484872c4af3150aeebe101ba560dcfb34488d9a8ff8dbcd21079647", 211 | "sha256:d81eddcb12d608cc08081fa88d046c78afb1bf8107e6feab5d43503fea74a635", 212 | "sha256:dc862056f76443a0db4509116c5cd480fe1b6a2d45512a653f9a855cc0517456", 213 | "sha256:ecc771ab628ea281517e24fd2c52e8f31c41e66652d07599ad8818abaad38cda", 214 | "sha256:f200755768dc19c6f4e2b672421e0ebb3dd54c38d5a4f262b872d8cfcc9e93b5", 215 | "sha256:f21756997ad8ef815d8ef3d34edd98804ab5ea337feedcd62fb52d22bf531281", 216 | "sha256:fc13a9524bc18b6fb6e0dbec3533ba0496bbed167c56d0aabefd965584557d80" 217 | ], 218 | "version": "==5.1.0" 219 | }, 220 | "requests": { 221 | "hashes": [ 222 | "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", 223 | "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" 224 | ], 225 | "index": "pypi", 226 | "version": "==2.25.1" 227 | }, 228 | "ruia": { 229 | "extras": [ 230 | "uvloop" 231 | ], 232 | "hashes": [ 233 | "sha256:79585cb92862462547959a1da299fca4c76cbe16d72bc8d4a93e65c037f2431e" 234 | ], 235 | "index": "pypi", 236 | "version": "==0.8.0" 237 | }, 238 | "soupsieve": { 239 | "hashes": [ 240 | "sha256:407fa1e8eb3458d1b5614df51d9651a1180ea5fedf07feb46e45d7e25e6d6cdd", 241 | "sha256:d3a5ea5b350423f47d07639f74475afedad48cf41c0ad7a82ca13a3928af34f6" 242 | ], 243 | "markers": "python_version >= '3.0'", 244 | "version": "==2.2" 245 | }, 246 | "termcolor": { 247 | "hashes": [ 248 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" 249 | ], 250 | "index": "pypi", 251 | "version": "==1.1.0" 252 | }, 253 | "typer": { 254 | "hashes": [ 255 | "sha256:5455d750122cff96745b0dec87368f56d023725a7ebc9d2e54dd23dc86816303", 256 | "sha256:ba58b920ce851b12a2d790143009fa00ac1d05b3ff3257061ff69dbdfc3d161b" 257 | ], 258 | "index": "pypi", 259 | "version": "==0.3.2" 260 | }, 261 | "typing-extensions": { 262 | "hashes": [ 263 | "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918", 264 | "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c", 265 | "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f" 266 | ], 267 | "markers": "python_version < '3.8'", 268 | "version": "==3.7.4.3" 269 | }, 270 | "urllib3": { 271 | "hashes": [ 272 | "sha256:1b465e494e3e0d8939b50680403e3aedaa2bc434b7d5af64dfd3c958d7f5ae80", 273 | "sha256:de3eedaad74a2683334e282005cd8d7f22f4d55fa690a2a1020a416cb0a47e73" 274 | ], 275 | "version": "==1.26.3" 276 | }, 277 | "uvloop": { 278 | "hashes": [ 279 | "sha256:1ae1ad731c8c0dcee80e0ecf06274f0f7293244d2cef81fa2747321a370a6aba", 280 | 
"sha256:236a3c31096e0845029856f7bc07a938340c2cdb35d9d39b38c9253b672bf948", 281 | "sha256:47ec567151070ed770211d359ad9250b59368548c60212c7ef6dda3f5b1778f6", 282 | "sha256:66881fe8a2187334c4dd5010c56310bdf32fe426613f9ca727f090bc31280624", 283 | "sha256:7846828112bfb49abc5fdfc47d0e4dfd7402115c9fde3c14c31818cfbeeb63dc", 284 | "sha256:9541dc3f391941796ae95c9c3bb16b813acf9e3d4beebfd3b623f1acb22d318d", 285 | "sha256:ca8a9e982f0bfbe331f41902cdd721c6e749e4685a403685e792b86a584f5969", 286 | "sha256:e178c255622d928d464187e3ceba94db88465f6b17909c651483fb73af8d8b85", 287 | "sha256:e72779681f839b6a069d7e7a9f7962a1d1927612c5c2e33071415478bdc1b91b", 288 | "sha256:ed073d24e0c383c24d17d3a2bb209b999ff0a8130e89b7c3f033db9e0c3bd04f" 289 | ], 290 | "version": "==0.15.1" 291 | }, 292 | "yarl": { 293 | "hashes": [ 294 | "sha256:00d7ad91b6583602eb9c1d085a2cf281ada267e9a197e8b7cae487dadbfa293e", 295 | "sha256:0355a701b3998dcd832d0dc47cc5dedf3874f966ac7f870e0f3a6788d802d434", 296 | "sha256:15263c3b0b47968c1d90daa89f21fcc889bb4b1aac5555580d74565de6836366", 297 | "sha256:2ce4c621d21326a4a5500c25031e102af589edb50c09b321049e388b3934eec3", 298 | "sha256:31ede6e8c4329fb81c86706ba8f6bf661a924b53ba191b27aa5fcee5714d18ec", 299 | "sha256:324ba3d3c6fee56e2e0b0d09bf5c73824b9f08234339d2b788af65e60040c959", 300 | "sha256:329412812ecfc94a57cd37c9d547579510a9e83c516bc069470db5f75684629e", 301 | "sha256:4736eaee5626db8d9cda9eb5282028cc834e2aeb194e0d8b50217d707e98bb5c", 302 | "sha256:4953fb0b4fdb7e08b2f3b3be80a00d28c5c8a2056bb066169de00e6501b986b6", 303 | "sha256:4c5bcfc3ed226bf6419f7a33982fb4b8ec2e45785a0561eb99274ebbf09fdd6a", 304 | "sha256:547f7665ad50fa8563150ed079f8e805e63dd85def6674c97efd78eed6c224a6", 305 | "sha256:5b883e458058f8d6099e4420f0cc2567989032b5f34b271c0827de9f1079a424", 306 | "sha256:63f90b20ca654b3ecc7a8d62c03ffa46999595f0167d6450fa8383bab252987e", 307 | "sha256:68dc568889b1c13f1e4745c96b931cc94fdd0defe92a72c2b8ce01091b22e35f", 308 | "sha256:69ee97c71fee1f63d04c945f56d5d726483c4762845400a6795a3b75d56b6c50", 309 | "sha256:6d6283d8e0631b617edf0fd726353cb76630b83a089a40933043894e7f6721e2", 310 | "sha256:72a660bdd24497e3e84f5519e57a9ee9220b6f3ac4d45056961bf22838ce20cc", 311 | "sha256:73494d5b71099ae8cb8754f1df131c11d433b387efab7b51849e7e1e851f07a4", 312 | "sha256:7356644cbed76119d0b6bd32ffba704d30d747e0c217109d7979a7bc36c4d970", 313 | "sha256:8a9066529240171b68893d60dca86a763eae2139dd42f42106b03cf4b426bf10", 314 | "sha256:8aa3decd5e0e852dc68335abf5478a518b41bf2ab2f330fe44916399efedfae0", 315 | "sha256:97b5bdc450d63c3ba30a127d018b866ea94e65655efaf889ebeabc20f7d12406", 316 | "sha256:9ede61b0854e267fd565e7527e2f2eb3ef8858b301319be0604177690e1a3896", 317 | "sha256:b2e9a456c121e26d13c29251f8267541bd75e6a1ccf9e859179701c36a078643", 318 | "sha256:b5dfc9a40c198334f4f3f55880ecf910adebdcb2a0b9a9c23c9345faa9185721", 319 | "sha256:bafb450deef6861815ed579c7a6113a879a6ef58aed4c3a4be54400ae8871478", 320 | "sha256:c49ff66d479d38ab863c50f7bb27dee97c6627c5fe60697de15529da9c3de724", 321 | "sha256:ce3beb46a72d9f2190f9e1027886bfc513702d748047b548b05dab7dfb584d2e", 322 | "sha256:d26608cf178efb8faa5ff0f2d2e77c208f471c5a3709e577a7b3fd0445703ac8", 323 | "sha256:d597767fcd2c3dc49d6eea360c458b65643d1e4dbed91361cf5e36e53c1f8c96", 324 | "sha256:d5c32c82990e4ac4d8150fd7652b972216b204de4e83a122546dce571c1bdf25", 325 | "sha256:d8d07d102f17b68966e2de0e07bfd6e139c7c02ef06d3a0f8d2f0f055e13bb76", 326 | "sha256:e46fba844f4895b36f4c398c5af062a9808d1f26b2999c58909517384d5deda2", 327 | 
"sha256:e6b5460dc5ad42ad2b36cca524491dfcaffbfd9c8df50508bddc354e787b8dc2", 328 | "sha256:f040bcc6725c821a4c0665f3aa96a4d0805a7aaf2caf266d256b8ed71b9f041c", 329 | "sha256:f0b059678fd549c66b89bed03efcabb009075bd131c248ecdf087bdb6faba24a", 330 | "sha256:fcbb48a93e8699eae920f8d92f7160c03567b421bc17362a9ffbbd706a816f71" 331 | ], 332 | "version": "==1.6.3" 333 | } 334 | }, 335 | "develop": {} 336 | } 337 | -------------------------------------------------------------------------------- /scraper/zcool.py: -------------------------------------------------------------------------------- 1 | # @AUTHOR: lonsty 2 | # @DATE: 2019-09-07 18:34:18 3 | import json 4 | import math 5 | import os.path as op 6 | import re 7 | import sys 8 | import threading 9 | import time 10 | from collections import namedtuple 11 | from concurrent.futures import ThreadPoolExecutor, as_completed, wait 12 | from datetime import datetime 13 | from pathlib import Path 14 | from queue import Empty, Queue 15 | from typing import List 16 | from urllib.parse import urljoin, urlparse 17 | from uuid import uuid4 18 | 19 | import click 20 | import requests 21 | from bs4 import BeautifulSoup 22 | from termcolor import colored, cprint 23 | 24 | from scraper.utils import (mkdirs_if_not_exist, parse_resources, retry, 25 | safe_filename, sort_records) 26 | 27 | Scrapy = namedtuple('Scrapy', 'type author title objid index url') # 用于记录下载任务 28 | HEADERS = { 29 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' 30 | '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' 31 | } 32 | HOST_PAGE = 'https://www.zcool.com.cn' 33 | SEARCH_DESIGNER_SUFFIX = '/search/designer?&word={word}' 34 | USER_SUFFIX = '/u/{id}' 35 | PAGE_SUFFIX = '?myCate=0&sort=1&p={page}' 36 | WORK_SUFFIX = '/work/content/show?p=1&objectId={objid}' 37 | COLLECTION_SUFFIX = '/collection/contents?id={objid}&p={page}&pageSize=25' 38 | USER_API = 'https://www.zcool.com.cn/member/card/{id}' 39 | TIMEOUT = 30 40 | Q_TIMEOUT = 1 41 | MAX_WORKERS = 20 42 | RETRIES = 3 43 | 44 | thread_local = threading.local() 45 | 46 | 47 | def get_session(): 48 | """使线程获取同一个 Session,可减少 TCP 连接数,加速请求。 49 | 50 | :return requests.Session: session 51 | """ 52 | if not hasattr(thread_local, "session"): 53 | thread_local.session = requests.Session() 54 | return thread_local.session 55 | 56 | 57 | @retry(Exception, tries=RETRIES) 58 | def session_request(url: str, method: str = 'GET') -> requests.Response: 59 | """使用 session 请求数据。使用了装饰器 retry,在网络异常导致错误时会重试。 60 | 61 | :param str url: 目标请求 URL 62 | :param str method: 请求方式 63 | :return requests.Response: 响应数据 64 | """ 65 | resp = get_session().request(method, url, headers=HEADERS, timeout=TIMEOUT) 66 | resp.raise_for_status() 67 | return resp 68 | 69 | 70 | class ZCoolScraper(): 71 | 72 | def __init__(self, user_id=None, username=None, collection=None, destination=None, 73 | max_pages=None, spec_topics=None, max_topics=None, max_workers=None, 74 | retries=None, redownload=None, overwrite=False, thumbnail=False): 75 | """初始化下载参数。 76 | 77 | :param int user_id: 用户 ID 78 | :param str username: 用户名 79 | :param HttpUrl collection: 收藏集 URL 80 | :param str destination: 图片保存到本地的路径,默认当前路径 81 | :param int max_pages: 最大爬取页数,默认所有 82 | :param list spec_topics: 需要下载的特定主题 83 | :param int max_topics: 最大下载主题数量,默认所有 84 | :param int max_workers: 线程开启个数,默认 20 85 | :param int retries: 请求异常时的重试次数,默认 3 86 | :param str redownload: 下载记录文件,给定此文件则从失败记录进行下载 87 | :param bool overwrite: 是否覆盖已存在的文件,默认 False 88 | :param bool thumbnail: 是否下载缩略图,默认 False 89 | """ 90 | 
self.start_time = datetime.now() 91 | print(f' - - - - - -+-+ {self.start_time.ctime()} +-+- - - - - -\n') 92 | self.collection = collection 93 | self.spec_topics = spec_topics 94 | self.max_topics = max_topics or 'all' 95 | self.max_workers = max_workers or MAX_WORKERS 96 | self.pool = ThreadPoolExecutor(self.max_workers) 97 | self.overwrite = overwrite 98 | self.thumbnail = thumbnail 99 | self.pages = Queue() 100 | self.topics = Queue() 101 | self.images = Queue() 102 | self.stat = { 103 | 'npages': 0, 104 | 'ntopics': 0, 105 | 'nimages': 0, 106 | 'pages_pass': set(), 107 | 'pages_fail': set(), 108 | 'topics_pass': set(), 109 | 'topics_fail': set(), 110 | 'images_pass': set(), 111 | 'images_fail': set() 112 | } 113 | 114 | if retries: 115 | # 重置全局变量 RETRIES 116 | global RETRIES 117 | RETRIES = retries 118 | 119 | dest = Path(destination or '', urlparse(HOST_PAGE).netloc) 120 | 121 | # 从记录文件中的失败项开始下载 122 | if redownload: 123 | self.username = self.reload_records(redownload) 124 | self.user_id = self.search_id_by_username(self.username) 125 | self.max_pages = self.pages.qsize() 126 | self.max_topics = self.topics.qsize() 127 | self.directory = dest / safe_filename(self.username) 128 | self.stat.update({ 129 | 'npages': self.max_pages, 130 | 'ntopics': self.max_topics, 131 | 'nimages': self.images.qsize() 132 | }) 133 | print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n' 134 | f'{"User ID".rjust(17)}: {self.user_id}\n' 135 | f'{"Pages to scrapy".rjust(17)}: {self.max_pages:2d}\n' 136 | f'{"Topics to scrapy".rjust(17)}: {self.max_topics:3d}\n' 137 | f'{"Images to scrapy".rjust(17)}: {self.images.qsize():4d}\n' 138 | f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n') 139 | self.fetch_all(initialized=True) 140 | return 141 | 142 | # 从收藏集下载 143 | if collection: 144 | objid = self.parse_objid(collection, is_collection=True) 145 | resp = session_request(urljoin(HOST_PAGE, COLLECTION_SUFFIX.format(objid=objid, page=1))) 146 | data = resp.json().get('data', {}) 147 | total = data.get('total', 0) 148 | page_size = data.get('pageable', {}).get('pageSize') 149 | max_pages_ = math.ceil(total / page_size) 150 | self.max_pages = min(max_pages or 9999, max_pages_) 151 | self.directory = dest / safe_filename(f'{self.username}-{self._collection_name}') 152 | self.parse_collection_topics(data.get('content')) 153 | 154 | # 解析第 2 页 至 最大页的 topic 到下载任务 155 | for page in range(2, self.max_pages + 1): 156 | resp = session_request(urljoin(HOST_PAGE, COLLECTION_SUFFIX.format(objid=objid, page=page))) 157 | self.parse_collection_topics(topics=resp.json().get('data', {}).get('content'), 158 | offset=page_size * (page - 1)) 159 | 160 | # 根据用户 ID 或用户名下载 161 | else: 162 | self.user_id = user_id or self.search_id_by_username(username) 163 | self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id)) 164 | 165 | try: 166 | response = session_request(self.base_url) 167 | except requests.exceptions.ProxyError: 168 | cprint('Cannot connect to proxy.', 'red') 169 | sys.exit(1) 170 | except Exception as e: 171 | cprint(f'Failed to connect to {self.base_url}, {e}', 'red') 172 | sys.exit(1) 173 | 174 | soup = BeautifulSoup(markup=response.text, features='html.parser') 175 | try: 176 | author = soup.find(name='div', id='body').get('data-name') 177 | if username and username != author: 178 | cprint(f'Invalid user id:「{user_id}」or username:「{username}」!', 'red') 179 | sys.exit(1) 180 | self.username = author 181 | except Exception: 182 | self.username = username or 
'anonymous' 183 | self.directory = dest / safe_filename(self.username) 184 | try: 185 | max_pages_ = int(soup.find(id='laypage_0').find_all(name='a')[-2].text) 186 | except Exception: 187 | max_pages_ = 1 188 | self.max_pages = min(max_pages or 9999, max_pages_) 189 | 190 | if self.spec_topics: 191 | topics = ', '.join(self.spec_topics) 192 | elif self.max_topics == 'all': 193 | topics = 'all' 194 | else: 195 | topics = self.max_pages * self.max_topics 196 | print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n' 197 | f'{"User ID".rjust(17)}: {self.user_id}\n' 198 | f'{"Maximum pages".rjust(17)}: {max_pages_}\n' 199 | f'{"Pages to scrapy".rjust(17)}: {self.max_pages}\n' 200 | f'{"Topics to scrapy".rjust(17)}: {topics}\n' 201 | f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n') 202 | 203 | self.END_PARSING_TOPICS = False 204 | self.fetch_all(initialized=True if self.collection else False) 205 | 206 | def search_id_by_username(self, username): 207 | """通过用户昵称查找用户 ID。 208 | 209 | :param str username: 用户昵称 210 | :return int: 用户 ID 211 | """ 212 | if not username: 213 | cprint('Must give an or !', 'yellow') 214 | sys.exit(1) 215 | 216 | search_url = urljoin(HOST_PAGE, SEARCH_DESIGNER_SUFFIX.format(word=username)) 217 | try: 218 | response = session_request(search_url) 219 | except requests.exceptions.ProxyError: 220 | cprint('Cannot connect to proxy.', 'red') 221 | sys.exit(1) 222 | except Exception as e: 223 | cprint(f'Failed to connect to {search_url}, {e}', 'red') 224 | sys.exit(1) 225 | 226 | author_1st = BeautifulSoup(response.text, 'html.parser').find(name='div', class_='author-info') 227 | if (not author_1st) or (author_1st.get('data-name') != username): 228 | cprint(f'Username「{username}」does not exist!', 'yellow') 229 | sys.exit(1) 230 | 231 | return author_1st.get('data-id') 232 | 233 | def reload_records(self, file): 234 | """从本地下载记录里读取下载失败的内容。 235 | 236 | :param str file: 下载记录文件的路径。 237 | :return str: 用户名 238 | """ 239 | with open(file, 'r', encoding='utf-8') as f: 240 | for fail in json.loads(f.read()).get('fail'): 241 | scrapy = Scrapy._make(fail.values()) 242 | if scrapy.type == 'page': 243 | self.pages.put(scrapy) 244 | elif scrapy.type == 'topic': 245 | self.topics.put(scrapy) 246 | elif scrapy.type == 'image': 247 | self.images.put(scrapy) 248 | return scrapy.author 249 | 250 | def generate_pages(self): 251 | """根据最大下载页数,生成需要爬取主页的任务。""" 252 | for page in range(1, self.max_pages + 1): 253 | suffix = COLLECTION_SUFFIX if self.collection else PAGE_SUFFIX 254 | url = urljoin(self.base_url, suffix.format(page=page)) 255 | scrapy = Scrapy(type='page', author=self.username, title=page, 256 | objid=None, index=page - 1, url=url) 257 | if scrapy not in self.stat["pages_pass"]: 258 | self.pages.put(scrapy) 259 | 260 | def parse_collection_topics(self, topics: List[dict], offset: int = 0): 261 | for idx, topic in enumerate(topics): 262 | new_scrapy = Scrapy(type='topic', 263 | author=topic.get('creatorObj', {}).get('username'), 264 | title=topic.get('title'), 265 | objid=topic.get('id'), 266 | index=offset + idx, 267 | url=topic.get('pageUrl')) 268 | if new_scrapy not in self.stat["topics_pass"]: 269 | self.topics.put(new_scrapy) 270 | self.stat["ntopics"] += 1 271 | 272 | def parse_topics(self, scrapy): 273 | """爬取主页,解析所有 topic,并将爬取主题的任务添加到任务队列。 274 | 275 | :param scrapy: 记录任务信息的数据体 276 | :return Scrapy: 记录任务信息的数据体 277 | """ 278 | resp = session_request(scrapy.url) 279 | cards = BeautifulSoup(resp.text, 'html.parser').find_all(name='a', 
280 |         for idx, card in enumerate(cards if self.max_topics == 'all' else cards[:self.max_topics + 1]):
281 |             title = card.get('title')
282 |             if self.spec_topics and (title not in self.spec_topics):
283 |                 continue
284 | 
285 |             new_scrapy = Scrapy(type='topic', author=scrapy.author, title=title,
286 |                                 objid=None, index=idx, url=card.get('href'))
287 |             if new_scrapy not in self.stat["topics_pass"]:
288 |                 self.topics.put(new_scrapy)
289 |                 self.stat["ntopics"] += 1
290 |         return scrapy
291 | 
292 |     def fetch_topics(self):
293 |         """Take pages from the task queue and parse them in the thread pool to collect topics to scrape."""
294 |         page_futures = {}
295 |         while True:
296 |             try:
297 |                 scrapy = self.pages.get(timeout=Q_TIMEOUT)
298 |                 page_futures[self.pool.submit(self.parse_topics, scrapy)] = scrapy
299 |             except Empty:
300 |                 break
301 |             except Exception:
302 |                 continue
303 | 
304 |         for future in as_completed(page_futures):
305 |             scrapy = page_futures.get(future)
306 |             try:
307 |                 future.result()
308 |                 self.stat["pages_pass"].add(scrapy)
309 |             except Exception:
310 |                 self.stat["pages_fail"].add(scrapy)
311 |                 cprint(f'GET page: {scrapy.title} ({scrapy.url}) failed.', 'red')
312 |         self.END_PARSING_TOPICS = True
313 | 
314 |     def parse_objid(self, url: str, is_collection: bool = False) -> str:
315 |         """Parse the objid from a topic or collection page.
316 | 
317 |         :param url: URL of the topic or collection
318 |         :return: the objid
319 |         """
320 |         soup = BeautifulSoup(session_request(url).text, 'html.parser')
321 |         objid = soup.find('input', id='dataInput').attrs.get('data-objid')
322 |         if is_collection:
323 |             self._collection_name = soup.find('h2', class_='title-h2').text
324 |             user = soup.find(name='span', class_='details-user-avatar')
325 |             self.user_id = user.find('div').attrs.get('data-id')
326 |             self.username = user.find('a').attrs.get('title')
327 |         return objid
328 | 
329 |     def parse_images(self, scrapy):
330 |         """Scrape a topic: resolve its objid, call the API to get image URLs and related info,
331 | 
332 |         then put image-download tasks into the queue.
333 |         :param scrapy: the task record
334 |         :return Scrapy: the task record
335 |         """
336 |         objid = scrapy.objid or self.parse_objid(scrapy.url)
337 |         resp = session_request(urljoin(HOST_PAGE, WORK_SUFFIX.format(objid=objid)))
338 |         data = resp.json().get('data', {})
339 |         author = data.get('product', {}).get('creatorObj', {}).get('username')
340 |         title = data.get('product', {}).get('title')
341 |         objid = data.get('product', {}).get('id')
342 | 
343 |         for img in data.get('allImageList', []):
344 |             new_scrapy = Scrapy(type='image', author=author, title=title,
345 |                                 objid=objid, index=img.get('orderNo') or 0, url=img.get('url'))
346 |             if new_scrapy not in self.stat["images_pass"]:
347 |                 self.images.put(new_scrapy)
348 |                 self.stat["nimages"] += 1
349 |         return scrapy
350 | 
351 |     def fetch_images(self):
352 |         """Take topics from the task queue and parse them in the thread pool to collect images to download."""
353 |         image_futures = {}
354 |         while True:
355 |             try:
356 |                 scrapy = self.topics.get(timeout=Q_TIMEOUT)
357 |                 image_futures[self.pool.submit(self.parse_images, scrapy)] = scrapy
358 |             except Empty:
359 |                 if self.END_PARSING_TOPICS:
360 |                     break
361 |             except Exception:
362 |                 continue
363 | 
364 |         for future in as_completed(image_futures):
365 |             scrapy = image_futures.get(future)
366 |             try:
367 |                 future.result()
368 |                 self.stat["topics_pass"].add(scrapy)
369 |             except Exception:
370 |                 self.stat["topics_fail"].add(scrapy)
371 |                 cprint(f'GET topic: {scrapy.title} ({scrapy.url}) failed.', 'red')
372 | 
373 |     def fetch_all(self, initialized: bool = False):
374 |         """Scrape pages and topics concurrently and keep the status display updated."""
375 |         if not initialized:
376 |             self.generate_pages()
377 |         fetch_futures = [self.pool.submit(self.fetch_topics),
378 |                          self.pool.submit(self.fetch_images)]
379 |         end_show_fetch = False
380 |         t = threading.Thread(target=self.show_fetch_status, kwargs={'end': lambda: end_show_fetch})
381 |         t.start()
382 |         try:
383 |             wait(fetch_futures)
384 |         except KeyboardInterrupt:
385 |             raise
386 |         finally:
387 |             end_show_fetch = True
388 |             t.join()
389 | 
390 |     def show_fetch_status(self, interval=0.5, end=None):
391 |         """Run in a background thread to display the scraping status while fetching.
392 | 
393 |         :param float interval: refresh interval in seconds
394 |         :param function end: callable used to signal the thread to exit
395 |         """
396 |         while True:
397 |             status = 'Fetched Pages: {pages}\tTopics: {topics}\tImages: {images}'.format(
398 |                 pages=colored(str(self.max_pages).rjust(3), 'blue'),
399 |                 topics=colored(str(self.stat["ntopics"]).rjust(3), 'blue'),
400 |                 images=colored(str(self.stat["nimages"]).rjust(5), 'blue'))
401 |             print(status, end='\r', flush=True)
402 |             if (interval == 0) or (end and end()):
403 |                 print('\n')
404 |                 break
405 |             time.sleep(interval)
406 | 
407 |     def show_download_status(self, interval=0.5, end=None):
408 |         """Run in a background thread to display the download status while downloading.
409 | 
410 |         :param float interval: refresh interval in seconds
411 |         :param function end: callable used to signal the thread to exit
412 |         """
413 |         while True:
414 |             completed = len(self.stat["images_pass"]) + len(self.stat["images_fail"])
415 |             if self.stat["nimages"] > 0:
416 |                 status = 'Time used: {time_used}\tFailed: {failed}\tCompleted: {completed}'.format(
417 |                     time_used=colored(str(datetime.now() - self.start_time)[:-7], 'yellow'),
418 |                     failed=colored(str(len(self.stat["images_fail"])).rjust(3), 'red'),
419 |                     completed=colored(str(int(completed / self.stat["nimages"] * 100))
420 |                                       + f'% ({completed}/{self.stat["nimages"]})', 'green'))
421 |                 print(status, end='\r', flush=True)
422 |             if (interval == 0) or (end and end()):
423 |                 if self.stat["nimages"] > 0:
424 |                     print('\n')
425 |                 break
426 |             time.sleep(interval)
427 | 
428 |     def download_image(self, scrapy):
429 |         """Download an image and save it to the local disk.
430 | 
431 |         :param scrapy: the task record
432 |         :return Scrapy: the task record
433 |         """
434 |         try:
435 |             name = re.findall(r'(?<=/)\w*?\.(?:jpg|gif|png|bmp)', scrapy.url, re.IGNORECASE)[0]
436 |         except IndexError:
437 |             name = uuid4().hex + '.jpg'
438 | 
439 |         path = self.directory / safe_filename(scrapy.title)
440 |         filename = path / f'[{scrapy.index + 1 or 0:02d}]{name}'
441 |         if (not self.overwrite) and op.isfile(filename):
442 |             return scrapy
443 | 
444 |         url = scrapy.url
445 |         if self.thumbnail:
446 |             if url.lower().endswith(('jpg', 'png', 'bmp')):
447 |                 # Append ZCool's resize suffix to request a 1280px-wide thumbnail
448 |                 url = f'{scrapy.url}@1280w_1l_2o_100sh.{url[-3:]}'
449 |         resp = session_request(url)
450 | 
451 |         mkdirs_if_not_exist(path)
452 |         with open(filename, 'wb') as f:
453 |             for chunk in resp.iter_content(8192):
454 |                 f.write(chunk)
455 |         return scrapy
456 | 
457 |     def save_records(self):
458 |         """Save the successful and failed download records to a local file.
459 | 
460 |         :return str: path to the records file
461 |         """
462 |         filename = f'{safe_filename(self.start_time.isoformat()[:-7])}.json'
463 |         abspath = op.abspath(self.directory / filename)
464 |         with open(abspath, 'w', encoding='utf-8') as f:
465 |             success = (self.stat["pages_pass"] | self.stat["topics_pass"] | self.stat["images_pass"])
466 |             fail = (self.stat["pages_fail"] | self.stat["topics_fail"] | self.stat["images_fail"])
467 |             type_order = {'page': 1, 'topic': 2, 'image': 3}
468 |             s_ordered = sort_records(success, order=type_order)
469 |             f_ordered = sort_records(fail, order=type_order)
470 | 
471 |             records = {
472 |                 'time': self.start_time.isoformat(),
473 |                 'success': [scrapy._asdict() for scrapy in s_ordered],
474 |                 'fail': [scrapy._asdict() for scrapy in f_ordered]
475 |             }
476 |             f.write(json.dumps(records, ensure_ascii=False, indent=2))
477 |         return abspath
478 | 
479 |     def run_scraper(self):
480 |         """Download all images with multiple threads, save the records when finished, then exit."""
481 |         end_show_download = False
482 |         t = threading.Thread(target=self.show_download_status, kwargs={'end': lambda: end_show_download})
483 |         t.start()
484 | 
485 |         image_futures = {}
486 |         while True:
487 |             try:
488 |                 scrapy = self.images.get_nowait()
489 |                 if scrapy not in self.stat["images_pass"]:
490 |                     image_futures[self.pool.submit(self.download_image, scrapy)] = scrapy
491 |             except Empty:
492 |                 break
493 |             except KeyboardInterrupt:
494 |                 raise
495 |             except Exception:
496 |                 continue
497 | 
498 |         try:
499 |             for future in as_completed(image_futures):
500 |                 scrapy = image_futures.get(future)
501 |                 try:
502 |                     future.result()
503 |                     self.stat["images_pass"].add(scrapy)
504 |                 except Exception:
505 |                     self.stat["images_fail"].add(scrapy)
506 |                     cprint(f'Download image: {scrapy.title}[{scrapy.index + 1}] '
507 |                            f'({scrapy.url}) failed.', 'red')
508 |         except KeyboardInterrupt:
509 |             raise
510 |         finally:
511 |             end_show_download = True
512 |             t.join()
513 | 
514 |         saved_images = len(self.stat["images_pass"])
515 |         failed_images = len(self.stat["images_fail"])
516 |         if saved_images or failed_images:
517 |             if saved_images:
518 |                 print(f'Saved {colored(saved_images, "green")} images to '
519 |                       f'{colored(self.directory.absolute(), attrs=["underline"])}')
520 |             records_path = self.save_records()
521 |             print(f'Saved records to {colored(records_path, attrs=["underline"])}')
522 |         else:
523 |             cprint('No images to download.', 'yellow')
524 | 
525 | 
526 | @click.command()
527 | @click.option('-u', '--usernames', 'names', help='One or more user names, separated by commas.')
528 | @click.option('-i', '--ids', 'ids', help='One or more user IDs, separated by commas.')
529 | @click.option('-c', '--collections', 'collections', help='One or more collection URLs, separated by commas.')
530 | @click.option('-t', '--topics', 'topics', help='Specific topics to download, separated by commas.')
531 | @click.option('-d', '--destination', 'destination', help='Destination to save images.')
532 | @click.option('-R', '--retries', 'retries', default=RETRIES, show_default=True, type=int,
533 |               help='Number of retries for failed downloads.')
534 | @click.option('-r', '--redownload', 'redownload',
535 |               help='Redownload images from failed records (PATH of the .json file).')
536 | @click.option('-o', '--overwrite', 'overwrite', is_flag=True, default=False, help='Overwrite existing files.')
537 | @click.option('--thumbnail', 'thumbnail', is_flag=True, default=False,
538 |               help='Download thumbnails with a maximum width of 1280px.')
539 | @click.option('--max-pages', 'max_pages', type=int, help='Maximum pages to download.')
540 | @click.option('--max-topics', 'max_topics', type=int, help='Maximum topics per page to download.')
541 | @click.option('--max-workers', 'max_workers', default=MAX_WORKERS, show_default=True, type=int,
542 |               help='Maximum thread workers.')
543 | def zcool_command(ids, names, collections, destination, max_pages, topics, max_topics,
544 |                   max_workers, retries, redownload, overwrite, thumbnail):
545 |     """ZCool picture crawler. Download pictures, photos and illustrations from
546 |     ZCool (https://zcool.com.cn/). Visit https://github.com/lonsty/scraper.
546 | """ 547 | if redownload: 548 | scraper = ZCoolScraper(destination=destination, max_pages=max_pages, spec_topics=topics, 549 | max_topics=max_topics, max_workers=max_workers, retries=retries, 550 | redownload=redownload, overwrite=overwrite, thumbnail=thumbnail) 551 | scraper.run_scraper() 552 | 553 | elif any([ids, names, collections]): 554 | topics = topics.split(',') if topics else [] 555 | resources = parse_resources(ids, names, collections) 556 | for res in resources: 557 | scraper = ZCoolScraper(user_id=res.id, username=res.name, collection=res.collection, 558 | destination=destination, max_pages=max_pages, spec_topics=topics, 559 | max_topics=max_topics, max_workers=max_workers, retries=retries, 560 | redownload=redownload, overwrite=overwrite) 561 | scraper.run_scraper() 562 | 563 | else: 564 | click.echo('Try "python zcool.py --help" for help.') 565 | return 1 566 | return 0 567 | --------------------------------------------------------------------------------