├── _config.yml ├── screenshots ├── 01.png ├── 02.png ├── 03.png ├── 04.png └── 05.png ├── tests ├── __init__.py ├── test_zcool.py └── test_multi_thread.py ├── requirements.txt ├── zcool.py ├── cnu.py ├── scraper ├── __init__.py ├── utils.py ├── cnu.py └── zcool.py ├── Pipfile ├── LICENSE ├── .gitignore ├── README.md └── Pipfile.lock /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /screenshots/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/01.png -------------------------------------------------------------------------------- /screenshots/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/02.png -------------------------------------------------------------------------------- /screenshots/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/03.png -------------------------------------------------------------------------------- /screenshots/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/04.png -------------------------------------------------------------------------------- /screenshots/05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonsty/scraper/HEAD/screenshots/05.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : __init__.py 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 8:46 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles>=0.6.0 2 | beautifulsoup4>=4.9.1 3 | click>=7.0 4 | requests>=2.24.0 5 | ruia[uvloop]>=0.8.0 6 | termcolor>=1.1.0 7 | typer>=0.3.2 8 | -------------------------------------------------------------------------------- /zcool.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : zcool 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:19 4 | import sys 5 | 6 | from scraper import zcool_command 7 | 8 | if __name__ == '__main__': 9 | sys.exit(zcool_command()) 10 | -------------------------------------------------------------------------------- /cnu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # @Author: eilianxiao 3 | # @Date: Dec 27 03:59 2020 4 | import typer 5 | 6 | from scraper.cnu import cnu_command 7 | 8 | if __name__ == '__main__': 9 | typer.run(cnu_command) 10 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : __init__.py 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:04 4 | from .zcool import ZCoolScraper, zcool_command 5 | 6 | __author__ = 'lonsty' 7 | __email__ = 'lonsty@sina.com' 8 | __version__ = '0.1.4' 9 | 10 | __all__ = [ 11 | 'ZCoolScraper', 12 | 
'zcool_command' 13 | ] 14 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | aiofiles = ">=0.6.0" 10 | beautifulsoup4 = ">=4.9.1" 11 | click = ">=7.0" 12 | requests = ">=2.24.0" 13 | ruia = {version = ">=0.8.0", extras = ["uvloop"]} 14 | termcolor = ">=1.1.0" 15 | typer = ">=0.3.2" 16 | 17 | [requires] 18 | python_version = "3.6" 19 | -------------------------------------------------------------------------------- /tests/test_zcool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Tests for `zcooldl` package.""" 3 | import unittest 4 | 5 | from click.testing import CliRunner 6 | 7 | from scraper.zcool import zcool_command 8 | 9 | 10 | class TestZcooldl(unittest.TestCase): 11 | """Tests for `zcooldl` package.""" 12 | 13 | def setUp(self): 14 | """Set up test fixtures, if any.""" 15 | 16 | def tearDown(self): 17 | """Tear down test fixtures, if any.""" 18 | 19 | def test_000_something(self): 20 | """Test something.""" 21 | 22 | def test_command_line_interface(self): 23 | """Test the CLI.""" 24 | runner = CliRunner() 25 | result = runner.invoke(zcool_command) 26 | assert result.exit_code == 0 27 | assert 'Try "python zcool.py --help" for help.' in result.output 28 | help_result = runner.invoke(zcool_command, ['--help']) 29 | assert help_result.exit_code == 0 30 | assert 'Show this message and exit.' in help_result.output 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Allen Shaw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea 107 | www.zcool.com.cn/ 108 | www.cnu.cc/ 109 | -------------------------------------------------------------------------------- /scraper/utils.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : utils 2 | # @AUTHOR : lonsty 3 | # @DATE : 2019/9/9 11:09 4 | import os 5 | import random 6 | import time 7 | from collections import namedtuple 8 | from functools import wraps 9 | from typing import Iterable 10 | 11 | 12 | def retry(exceptions, tries=3, delay=1, backoff=2, logger=None): 13 | """Retry calling the decorated function using an exponential backoff. 14 | 15 | :param exceptions: The exception to check. may be a tuple of exceptions to check. 16 | :param tries: Number of times to try (not retry) before giving up. 17 | :param delay: Initial delay between retries in seconds. 18 | :param backoff: Backoff multiplier (e.g. value of 2 will double the delay each retry). 19 | :param logger: Logger to use. If None, print. 
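    Illustrative usage (a minimal sketch; ``fetch_page`` and its URL argument are
    hypothetical names, not part of this package)::

        @retry(requests.RequestException, tries=3, delay=1, backoff=2)
        def fetch_page(url):
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            return resp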
20 | """ 21 | 22 | def deco_retry(f): 23 | 24 | @wraps(f) 25 | def f_retry(*args, **kwargs): 26 | mtries, mdelay = tries, delay or random.uniform(0.5, 1.5) 27 | while mtries > 1: 28 | try: 29 | return f(*args, **kwargs) 30 | except exceptions as e: 31 | if logger: 32 | logger.warning('{}, Retrying in {} seconds...'.format(e, mdelay)) 33 | # else: 34 | # print('{}, Retrying in {} seconds...'.format(e, mdelay)) 35 | time.sleep(mdelay) 36 | mtries -= 1 37 | mdelay *= backoff 38 | return f(*args, **kwargs) 39 | 40 | return f_retry # true decorator 41 | 42 | return deco_retry 43 | 44 | 45 | def mkdirs_if_not_exist(dir): 46 | """文件夹不存在时则创建。 47 | 48 | :param str dir: 文件夹路径,支持多级 49 | """ 50 | if not os.path.isdir(dir): 51 | try: 52 | os.makedirs(dir) 53 | return True 54 | except FileExistsError: 55 | pass 56 | 57 | 58 | def safe_filename(filename): 59 | """去掉文件名中的非法字符。 60 | 61 | :param str filename: 文件名 62 | :return str: 合法文件名 63 | """ 64 | return "".join([c for c in filename if c not in r'\/:*?"<>|']).strip() 65 | 66 | 67 | def parse_resources(ids, names, collections): 68 | """解析用户名或 ID。 69 | 70 | :param str ids: 半角逗号分隔的用户 ID 71 | :param str names: 半角逗号分隔的用户名 72 | :return list: 包含 User 数据的列表 73 | """ 74 | Resource = namedtuple('Resource', 'id name collection') 75 | resources = [] 76 | if collections: 77 | resources = [Resource(None, None, collection) for collection in collections.split(',')] 78 | elif names: 79 | resources = [Resource(None, name, None) for name in names.split(',')] 80 | elif ids: 81 | resources = [Resource(uid, None, None) for uid in ids.split(',')] 82 | return resources # TODO: 去重 83 | 84 | 85 | def sort_records(records: Iterable, order: dict): 86 | """根据自定义的排序规则排序 87 | 88 | :param Iterable records: 要排序的记录 89 | :param dict order: 自定义的排序 90 | :return: 91 | """ 92 | 93 | def _order_by(obj: namedtuple): 94 | if obj.type == 'topic': 95 | return (order[obj.type], obj.index, obj.objid, obj.title, obj.url) 96 | return (order[obj.type], obj.objid, obj.index, obj.title, obj.url) 97 | 98 | return sorted(records, key=_order_by) 99 | -------------------------------------------------------------------------------- /tests/test_multi_thread.py: -------------------------------------------------------------------------------- 1 | # @FILENAME : test_requests 2 | # @AUTHOR : lonsty 3 | # @DATE : 2020/12/27 14:44 4 | import os 5 | import threading 6 | from concurrent.futures import ThreadPoolExecutor, wait 7 | from pathlib import Path 8 | import traceback 9 | 10 | import requests 11 | 12 | # http://www.cnu.cc/works/427334 13 | # urls = [ 14 | # 'http://imgoss.cnu.cc/2010/30/994dfe50c509344eb3f5b525df642d60.jpg', 15 | # 'http://imgoss.cnu.cc/2010/30/4e695801d1c83c559991017cb8e2ff7a.jpg', 16 | # 'http://imgoss.cnu.cc/2010/30/07c3089cf6383bf596c59e92dc105412.jpg', 17 | # 'http://imgoss.cnu.cc/2010/30/a5b738e0f21737bb953e7407c8425772.jpg', 18 | # 'http://imgoss.cnu.cc/2010/30/24dd6e27c9cf38eab7daa72cc3e182e3.jpg', 19 | # 'http://imgoss.cnu.cc/2010/30/624cbd2e69313174b6ad13a9d2e75279.jpg', 20 | # 'http://imgoss.cnu.cc/2010/30/9786475b74733c91ba7f8b638468e299.jpg', 21 | # 'http://imgoss.cnu.cc/2010/30/75e085d6573c326dab3a8959736cb355.jpg', 22 | # 'http://imgoss.cnu.cc/2010/30/d28d0a7729353414a58e8fa10ff11fe8.jpg', 23 | # 'http://imgoss.cnu.cc/2010/30/669be1298a3e3546974977ed7c9655eb.jpg', 24 | # 'http://imgoss.cnu.cc/2010/30/ee82358fd6a537108d938ebba7cfe7de.jpg', 25 | # 'http://imgoss.cnu.cc/2010/30/007ce903478e3f6aa867a5789d213748.jpg', 26 | # 
'http://imgoss.cnu.cc/2010/30/49549ca620873ba5bf71dd8c92d9e006.jpg', 27 | # 'http://imgoss.cnu.cc/2010/30/11426afdb0453d1b86bc7a2bb187bcef.jpg', 28 | # 'http://imgoss.cnu.cc/2010/30/39b4116ba53731f3994acddef431532c.jpg', 29 | # 'http://imgoss.cnu.cc/2010/30/7a7b6ae21eb13701a8f238aaedcf6d7d.jpg', 30 | # 'http://imgoss.cnu.cc/2010/30/e9ed54e368873d44b6b8fc276f0a6018.jpg', 31 | # 'http://imgoss.cnu.cc/2010/30/5f97c9f5ff6f353f814be1f9867ba6d0.jpg', 32 | # 'http://imgoss.cnu.cc/2010/30/42fb0868ca8434ec8c5f3460b96b2b5b.jpg', 33 | # 'http://imgoss.cnu.cc/2010/30/584e84b9e0253c80ac33f22a33adb9df.jpg' 34 | # ] 35 | 36 | # http://www.cnu.cc/works/435640 37 | urls = [ 38 | 'http://imgoss.cnu.cc/2012/25/pv6kqgjspuf4e9mefu01608863905044.jpg', 39 | 'http://imgoss.cnu.cc/2012/25/tpl70xav4fk8zsu55881608863905046.jpg', 40 | 'http://imgoss.cnu.cc/2012/25/9mkp309it3ub5qv9f6f1608863905046.jpg', 41 | 'http://imgoss.cnu.cc/2012/25/zb7f9qlpu75x2s53i821608863905047.jpg', 42 | 'http://imgoss.cnu.cc/2012/25/r2kxv11qltnruneqlpk1608863905047.jpg', 43 | 'http://imgoss.cnu.cc/2012/25/oqr644pxdcxeb404n1e1608863905048.jpg', 44 | 'http://imgoss.cnu.cc/2012/25/vjgt0am668sus1kvvcj1608863905048.jpg', 45 | 'http://imgoss.cnu.cc/2012/25/wjchu6v3en8iin2x3qf1608863905049.jpg', 46 | 'http://imgoss.cnu.cc/2012/25/jnq983zvv9k6iatdofo1608863905049.jpg' 47 | ] 48 | thread_local = threading.local() 49 | dest = Path('www.cnu.cc/冬日暖阳') 50 | 51 | 52 | def mkdirs_if_not_exist(dir): 53 | """文件夹不存在时则创建。 54 | 55 | :param str dir: 文件夹路径,支持多级 56 | """ 57 | if not os.path.isdir(dir): 58 | try: 59 | os.makedirs(dir) 60 | return True 61 | except FileExistsError: 62 | pass 63 | 64 | 65 | def get_session(): 66 | """使线程获取同一个 Session,可减少 TCP 连接数,加速请求。 67 | 68 | :return requests.Session: session 69 | """ 70 | if not hasattr(thread_local, "session"): 71 | thread_local.session = requests.Session() 72 | return thread_local.session 73 | 74 | 75 | def download_image(url): 76 | print(f'Downloading {url} ...') 77 | session = get_session() 78 | try: 79 | response = session.get(url, timeout=20) 80 | except Exception: 81 | print(traceback.format_exc()) 82 | return 83 | filepath = dest / url.split("/")[-1] 84 | with open(filepath, 'wb') as f: 85 | for chunk in response.iter_content(8192): 86 | f.write(chunk) 87 | print(f'Saved to {filepath}') 88 | 89 | 90 | if __name__ == '__main__': 91 | print('Start ...') 92 | os.makedirs(dest, exist_ok=True) 93 | with ThreadPoolExecutor(max_workers=10) as pool: 94 | futures = [pool.submit(download_image, url) for url in urls] 95 | wait(futures) 96 | print('Done.') 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraper 2 | 3 | 图片爬取下载工具,极速爬取下载 站酷([https://www.zcool.com.cn/](https://www.zcool.com.cn/))、CNU 视觉([http://www.cnu.cc/](http://www.cnu.cc/))`设计师/用户` 上传的 `图片/照片/插画`。 4 | 5 | **:tada: :tada: :tada: 站酷下载工具已发布到 PyPI** 6 | 7 | - 项目地址:[https://github.com/lonsty/zcooldl](https://github.com/lonsty/zcooldl) 8 | - 在线文档:[https://zcooldl.readthedocs.io/](https://zcooldl.readthedocs.io/) 9 | - 快速安装:`pip install -U zcooldl` 10 | - 使用方式:`zcooldl -u ` 11 | 12 | `scraper` 本来是规划用来存放各式各样的爬虫程序的。站酷仅仅是当初构想中的一个,因为太懒而没有新增其他爬虫。 13 | 想不到 [zcool.py](scraper/zcool.py) 竟然从原来的几十行代码,逐步增加到现在的 500+ 行 :joy: :joy: :joy:。 14 | 15 | 16 | ## 支持网站: 17 | 18 | | 网站 | 入口 | 示例 | 19 | |:---:|:---:|:---:| 20 | | [Zcool 站酷](https://www.zcool.com.cn/) | [zcool.py](zcool.py) | `python zcool.py -u 叁乔居` | 21 | | [CNU 
视觉](http://www.cnu.cc/) | [cnu.py](cnu.py) | `python cnu.py http://www.cnu.cc/users/142231` | 22 | 23 | 24 | #### Zcool 站酷 25 | 26 | - [x] 极速下载:多线程异步下载,可以根据需要设置线程数 27 | - [x] 超清原图:默认下载超清原图(约几 MB),使用参数 `--thumbnail` 下载缩略图(宽最大 1280px,约 500KB) 28 | - [x] 下载收藏夹 `New`:使用 `-c <收藏夹 URL, ...>` 下载收藏夹中的作品(收藏夹可自由创建) 29 | 30 | #### CNU 视觉 31 | 32 | - [x] 下载 [视觉(CNU)](http://www.cnu.cc/) 作品 `New`:试用异步爬虫框架 [ruia](https://github.com/howie6879/ruia) 33 | - [x] 支持 3 类 URL 参数: 34 | 35 | - 作品集:[http://www.cnu.cc/works/117783](http://www.cnu.cc/works/117783) 36 | - 用户作品页:[http://www.cnu.cc/users/142231](http://www.cnu.cc/users/652629) 37 | - 用户推荐页:[http://www.cnu.cc/users/recommended/142231](http://www.cnu.cc/users/recommended/652629) 38 | 39 | 参数可接收多个 URL,且可随意组合,如: 40 | 41 | ```sh 42 | $ python cnu.py http://www.cnu.cc/works/117783 http://www.cnu.cc/users/652629 http://www.cnu.cc/users/recommended/652629 43 | ``` 44 | 45 | ### 环境: 46 | 47 | - `python3.6` 及以上 48 | 49 | # 快速使用 50 | 51 | 首先克隆项目到本地,并安装依赖: 52 | 53 | ```sh 54 | $ git clone https://github.com/lonsty/scraper.git 55 | 56 | $ cd scraper 57 | $ pip install -r requirements.txt 58 | ``` 59 | 60 | 1. 下载 [站酷(Zcool)](https://www.zcool.com.cn/)作品 61 | 62 | 下载用户名为 `username` 的所有图片到路径 `path` 下: 63 | 64 | ```sh 65 | $ python zcool.py -u -d 66 | ``` 67 | 68 | 运行截图 69 | 70 | ![screenshot_04.png](screenshots/04.png) 71 | 72 | ![screenshot_03.png](screenshots/03.png) 73 | 74 | ![screenshot_05.png](screenshots/05.png) 75 | 76 | 爬取结果 77 | 78 | ![screenshot_02.png](screenshots/02.png) 79 | 80 | 2. 下载 [视觉(CNU)](http://www.cnu.cc/)作品 81 | 82 | ```sh 83 | python cnu.py ... 84 | ``` 85 | 86 | # 使用帮助 87 | 88 | ### 常用命令 89 | 90 | 1. 只下载用户的**部分主题** 91 | 92 | ```sh 93 | $ python zcool.py -u -t ,,... 94 | ``` 95 | 96 | 2. 一次性下载**多个用户**的所有图片 97 | 98 | ```sh 99 | $ python zcool.py -u ,,... 100 | ``` 101 | 102 | 3. 部分图片**下载失败**或有**更新**,再执行相同的命令,对失败或新增的图片进行下载 103 | 104 | ```sh 105 | $ python zcool.py -u -d 106 | ``` 107 | 108 | ### 查看所有命令 109 | 110 | ```sh 111 | # Zcool 站酷 112 | $ python zcool.py --help 113 | 114 | Usage: zcool.py [OPTIONS] 115 | 116 | ZCool picture crawler, download pictures, photos and illustrations of 117 | ZCool (https://zcool.com.cn/). Visit https://github.com/lonsty/scraper. 118 | 119 | Options: 120 | -u, --usernames TEXT One or more user names, separated by commas. 121 | -i, --ids TEXT One or more user IDs, separated by commas. 122 | -c, --collections TEXT One or more collection URLs, separated by commas. 123 | -t, --topics TEXT Specific topics to download, separated by commas. 124 | -d, --destination TEXT Destination to save images. 125 | -R, --retries INTEGER Repeat download for failed images. [default: 3] 126 | -r, --redownload TEXT Redownload images from failed records (PATH of the 127 | .json file). 128 | -o, --overwrite Override the existing files. 129 | --thumbnail Download thumbnails with a maximum width of 1280px. 130 | --max-pages INTEGER Maximum pages to download. 131 | --max-topics INTEGER Maximum topics per page to download. 132 | --max-workers INTEGER Maximum thread workers. [default: 20] 133 | --help Show this message and exit. 134 | 135 | # CNU 视觉 136 | $ python cnu.py --help 137 | Usage: cnu.py [OPTIONS] START_URLS... 138 | 139 | A scraper to download images from http://www.cnu.cc/ 140 | 141 | Arguments: 142 | START_URLS... URLs of the works [required] 143 | 144 | Options: 145 | -d, --destination PATH Destination directory to save the images 146 | [default: .] 
147 | 148 | -o, --overwrite / -no, --no-overwrite 149 | Whether to overwrite existing images 150 | [default: False] 151 | 152 | -t, --thumbnail Whether to download the thumbnail images 153 | [default: False] 154 | 155 | -r, --retries INTEGER Number of retries when the download fails 156 | [default: 3] 157 | 158 | -w, --workers INTEGER Number of parallel workers [default: 2] 159 | -c, --concurrency INTEGER Number of concurrency [default: 25] 160 | --delay INTEGER Seconds to wait for the next request 161 | [default: 0] 162 | 163 | --retry-delay INTEGER Seconds to wait for the retry request 164 | [default: 0] 165 | 166 | --timeout INTEGER Seconds of HTTP request timeout [default: 167 | 20] 168 | 169 | --install-completion [bash|zsh|fish|powershell|pwsh] 170 | Install completion for the specified shell. 171 | --show-completion [bash|zsh|fish|powershell|pwsh] 172 | Show completion for the specified shell, to 173 | copy it or customize the installation. 174 | 175 | --help Show this message and exit. 176 | ``` 177 | 178 | # 更新历史 179 | 180 | - ## 0.1.5 (2020-12-27) 181 | 182 | - 新增爬虫:使用 ruia 异步爬虫框架下载 [视觉(CNU)](http://www.cnu.cc/)作品 183 | 184 | - ## 0.1.4 (2020-11-30) 185 | 186 | - 新增功能:新参数 `-c <收藏夹 URL, ...>`,支持下载收藏夹中的作品。 187 | 188 | - ## 0.1.3 (2020-07-22) 189 | 190 | - 修复了在动态加载页面中无法获取并下载所有图片的问题 191 | - 保存的图片文件名中加入了序号,以保持原始顺序 192 | - 添加了注释,并对代码细节做了调整 193 | 194 | - ## 2020.03.25 195 | 196 | - 优化了终端输出信息,用不同颜色文字进行了标识 197 | - 修复了在低网速下无法下载图片的问题,并加快了整体下载速度 198 | 199 | - ## 0.1.2 (2020-03-24) 200 | 201 | 新功能: 202 | 203 | - 新增下载超清原图(默认选项,约几 MB),使用参数 `--thumbnail` 下载缩略图(宽最大 1280px,约 500KB) 204 | - 新增支持下载 JPG、PNG、GIF、BMP 格式的图片 205 | 206 | - ## 0.1.1 (2019-12-09) 207 | 208 | 新功能: 209 | 210 | - 可以选择下载用户的特定主题 211 | - 支持一次性输入多个用户名或 ID 212 | 213 | BUG 修复: 214 | 215 | - 修复用户如果没有上传任何图片时的下载错误 216 | 217 | - ## 0.1.0 (2019-09-09) 218 | 219 | 主要功能: 220 | 221 | - 极速下载:多线程异步下载,可以根据需要设置线程数 222 | - 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/ 223 | - 增量下载:设计师/用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯! 224 | - 支持代理:可以配置使用代理(0.1.3 版本后改为自动读取系统代理) 225 | 226 | # LICENSE 227 | 228 | 此项目使用 [MIT](LICENSE) 开源协议 229 | 230 | **注意**:使用此工具下载的所有作品,版权归原作者所有,请谨慎使用! 
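
以下为组合用法示例(仅作演示:目录、线程数等取值可按需调整,具体选项以上文 `--help` 输出为准):

```sh
# Zcool:下载指定用户的超清原图到 ./downloads,使用 30 个下载线程
$ python zcool.py -u <username> -d ./downloads --max-workers 30

# CNU:下载一个作品集,4 个 worker、并发 25、请求超时 30 秒
$ python cnu.py -d ./downloads -w 4 -c 25 --timeout 30 http://www.cnu.cc/works/117783
```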
231 | -------------------------------------------------------------------------------- /scraper/cnu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # @Author: eilianxiao 3 | # @Date: Dec 26 18:44 2020 4 | import json 5 | from pathlib import Path 6 | from typing import List 7 | 8 | import aiofiles 9 | import typer 10 | from ruia import AttrField, Item, Spider, TextField 11 | 12 | from scraper.utils import mkdirs_if_not_exist, safe_filename 13 | 14 | IMAGE_HOST = 'http://imgoss.cnu.cc/' 15 | AUTHOR_RCMDS_PREFIX = 'http://www.cnu.cc/users/recommended/' 16 | AUTHOR_WORKS_PREFIX = 'http://www.cnu.cc/users/' 17 | WORK_PREFIX = 'http://www.cnu.cc/works/' 18 | THUMBNAIL_SUFFIX = '?x-oss-process=style/content' 19 | PAGE_SUFFIX = '?page={page}' 20 | 21 | APP_NAME = 'CNU Scraper' 22 | BASE_DIR = 'www.cnu.cc' 23 | START_URLS = [ 24 | 'http://www.cnu.cc/works/{id}', # 作品集 URL 25 | 'http://www.cnu.cc/users/{id}', # 用户作品页 URL 26 | 'http://www.cnu.cc/users/recommended/{id}', # 用户推荐页 URL 27 | ] 28 | DESTINATION = Path('.') 29 | OVERWRITE = False 30 | THUMBNAIL = False 31 | WORKER_NUMBERS = 2 32 | CONCURRENCY = 25 33 | RETRIES = 3 34 | DELAY = 0 35 | RETRY_DELAY = 0 36 | TIMEOUT = 20 37 | 38 | 39 | class PageItem(Item): 40 | target_item = TextField(css_select='div.pager_box') 41 | max_page = TextField(css_select='ul>li:nth-last-child(2)', default=1) 42 | 43 | 44 | class WorkItem(Item): 45 | target_item = TextField(css_select='div.work-thumbnail') 46 | author = TextField(css_select='div.author') 47 | title = TextField(css_select='div.title') # WorkPage 中是日期 48 | work = AttrField(css_select='.thumbnail', attr='href') 49 | 50 | 51 | class ImagesItem(Item): 52 | target_item = TextField(css_select='body') 53 | author = TextField(css_select='.author-info strong') 54 | title = TextField(css_select='.work-title') 55 | imgs_json = TextField(css_select='#imgs_json') 56 | 57 | 58 | class CNUSpider(Spider): 59 | name = APP_NAME 60 | start_urls = START_URLS 61 | request_config = { 62 | 'RETRIES': RETRIES, 63 | 'DELAY': 0, 64 | 'TIMEOUT': TIMEOUT 65 | } 66 | concurrency = CONCURRENCY 67 | # aiohttp config 68 | aiohttp_kwargs = {} 69 | 70 | def __init__(self, *args, **kwargs): 71 | super().__init__(*args, **kwargs) 72 | self._destination = DESTINATION 73 | self._overwrite = OVERWRITE 74 | self._thumbnail = THUMBNAIL 75 | # 更新 Spider 及自定义的配置 76 | for k, v in kwargs.get('spider_config', {}).items(): 77 | setattr(self, k, v) 78 | 79 | async def parse(self, response): 80 | if response.url.startswith(AUTHOR_WORKS_PREFIX): 81 | async for page_item in PageItem.get_items(html=await response.text()): 82 | for page in range(1, int(page_item.max_page) + 1): 83 | page_url = f'{response.url.split("?")[0]}{PAGE_SUFFIX.format(page=page)}' 84 | yield self.request( 85 | url=page_url, 86 | metadata={ 87 | 'current_page': page, 88 | 'max_page': page_item.max_page, 89 | }, 90 | callback=self.parse_page) 91 | elif response.url.startswith(WORK_PREFIX): 92 | yield self.parse_work(response) 93 | else: 94 | self.logger.warning(f'Parser not support URL: {response.url}') 95 | 96 | async def parse_page(self, response): 97 | async for work_item in WorkItem.get_items(html=await response.text()): 98 | yield self.request( 99 | url=work_item.work, 100 | metadata={ 101 | 'current_page': response.metadata['current_page'], 102 | 'max_page': response.metadata['max_page'], 103 | 'author': work_item.author, 104 | 'title': work_item.title, 105 | 'work': work_item.work 106 | }, 107 | 
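                # Each work URL found on this listing page is scheduled as its own
                # request; parse_work (defined below) then pulls the image URLs out of it.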
callback=self.parse_work 108 | ) 109 | 110 | async def parse_work(self, response): 111 | async for images_item in ImagesItem.get_items(html=await response.text()): 112 | urls = [IMAGE_HOST + img.get('img') for img in json.loads(images_item.imgs_json)] 113 | for index, url in enumerate(urls): 114 | basename = url.split('/')[-1] 115 | save_dir = (self._destination / 116 | BASE_DIR / 117 | safe_filename(images_item.author) / 118 | safe_filename(images_item.title)) 119 | fpath = save_dir / f'[{index + 1:02d}]{basename}' 120 | if self._overwrite or not fpath.is_file(): 121 | if self._thumbnail: 122 | url += THUMBNAIL_SUFFIX 123 | self.logger.info(f'Downloading {url} ...') 124 | yield self.request( 125 | url=url, 126 | metadata={ 127 | 'title': images_item.title, 128 | 'index': index, 129 | 'url': url, 130 | 'basename': basename, 131 | 'save_dir': save_dir, 132 | 'fpath': fpath 133 | }, 134 | callback=self.save_image 135 | ) 136 | else: 137 | self.logger.info(f'Skipped already exists: {fpath}') 138 | 139 | async def save_image(self, response): 140 | # 创建图片保存目录 141 | save_dir = response.metadata['save_dir'] 142 | if mkdirs_if_not_exist(save_dir): 143 | self.logger.info(f'Created directory: {save_dir}') 144 | # 保存图片 145 | fpath = response.metadata['fpath'] 146 | try: 147 | content = await response.read() 148 | except TypeError as e: 149 | self.logger.error(e) 150 | else: 151 | async with aiofiles.open(fpath, 'wb') as f: 152 | await f.write(content) 153 | self.logger.info(f'Saved to {fpath}') 154 | 155 | 156 | def cnu_command( 157 | start_urls: List[str] = typer.Argument( 158 | ..., 159 | help='URLs of the works' 160 | ), 161 | destination: Path = typer.Option( 162 | DESTINATION, '-d', '--destination', 163 | help='Destination directory to save the images' 164 | ), 165 | overwrite: bool = typer.Option( 166 | OVERWRITE, '-o / -no', '--overwrite / --no-overwrite', 167 | help='Whether to overwrite existing images' 168 | ), 169 | thumbnail: bool = typer.Option( 170 | THUMBNAIL, '-t', '--thumbnail', 171 | help='Whether to download the thumbnail images' 172 | ), 173 | retries: int = typer.Option( 174 | RETRIES, '-r', '--retries', 175 | help='Number of retries when the download fails' 176 | ), 177 | worker_numbers: int = typer.Option( 178 | WORKER_NUMBERS, '-w', '--workers', 179 | help='Number of parallel workers' 180 | ), 181 | concurrency: int = typer.Option( 182 | CONCURRENCY, '-c', '--concurrency', 183 | help='Number of concurrency' 184 | ), 185 | delay: int = typer.Option( 186 | DELAY, '--delay', 187 | help='Seconds to wait for the next request' 188 | ), 189 | retry_delay: int = typer.Option( 190 | RETRY_DELAY, '--retry-delay', 191 | help='Seconds to wait for the retry request' 192 | ), 193 | timeout: int = typer.Option( 194 | TIMEOUT, '--timeout', 195 | help='Seconds of HTTP request timeout' 196 | ), 197 | ): 198 | """ A scraper to download images from http://www.cnu.cc/""" 199 | # 开始爬虫任务 200 | CNUSpider.start( 201 | spider_config=dict( 202 | start_urls=list(start_urls), 203 | request_config={ 204 | 'RETRIES': retries, 205 | 'DELAY': delay, 206 | 'RETRY_DELAY': retry_delay, 207 | 'TIMEOUT': timeout 208 | }, 209 | _destination=destination, 210 | _overwrite=overwrite, 211 | _thumbnail=thumbnail, 212 | worker_numbers=worker_numbers, 213 | concurrency=concurrency 214 | ) 215 | ) 216 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | 
"sha256": "9076dc5aca307d83edc51f48f924cd9fd15942e012907c64fd1164885fef70a1" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "aiofiles": { 20 | "hashes": [ 21 | "sha256:bd3019af67f83b739f8e4053c6c0512a7f545b9a8d91aaeab55e6e0f9d123c27", 22 | "sha256:e0281b157d3d5d59d803e3f4557dcc9a3dff28a4dd4829a9ff478adae50ca092" 23 | ], 24 | "index": "pypi", 25 | "version": "==0.6.0" 26 | }, 27 | "aiohttp": { 28 | "hashes": [ 29 | "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0", 30 | "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6", 31 | "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf", 32 | "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9", 33 | "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e", 34 | "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0", 35 | "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329", 36 | "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2", 37 | "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40", 38 | "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a", 39 | "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4", 40 | "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de", 41 | "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9", 42 | "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9", 43 | "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb", 44 | "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076", 45 | "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de", 46 | "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907", 47 | "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d", 48 | "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536", 49 | "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d", 50 | "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54", 51 | "sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc", 52 | "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212", 53 | "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9", 54 | "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d", 55 | "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b", 56 | "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7", 57 | "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81", 58 | "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c", 59 | "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895", 60 | "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297", 61 | "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb", 62 | "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe", 63 | "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242", 64 | "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0", 65 | 
"sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2" 66 | ], 67 | "index": "pypi", 68 | "version": "==3.7.4" 69 | }, 70 | "async-timeout": { 71 | "hashes": [ 72 | "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", 73 | "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" 74 | ], 75 | "version": "==3.0.1" 76 | }, 77 | "attrs": { 78 | "hashes": [ 79 | "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6", 80 | "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700" 81 | ], 82 | "version": "==20.3.0" 83 | }, 84 | "beautifulsoup4": { 85 | "hashes": [ 86 | "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", 87 | "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", 88 | "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" 89 | ], 90 | "index": "pypi", 91 | "version": "==4.9.3" 92 | }, 93 | "certifi": { 94 | "hashes": [ 95 | "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", 96 | "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" 97 | ], 98 | "version": "==2020.12.5" 99 | }, 100 | "chardet": { 101 | "hashes": [ 102 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 103 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 104 | ], 105 | "version": "==3.0.4" 106 | }, 107 | "click": { 108 | "hashes": [ 109 | "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a", 110 | "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc" 111 | ], 112 | "index": "pypi", 113 | "version": "==7.1.2" 114 | }, 115 | "cssselect": { 116 | "hashes": [ 117 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 118 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 119 | ], 120 | "version": "==1.1.0" 121 | }, 122 | "idna": { 123 | "hashes": [ 124 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 125 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 126 | ], 127 | "version": "==2.10" 128 | }, 129 | "idna-ssl": { 130 | "hashes": [ 131 | "sha256:a933e3bb13da54383f9e8f35dc4f9cb9eb9b3b78c6b36f311254d6d0d92c6c7c" 132 | ], 133 | "markers": "python_version < '3.7'", 134 | "version": "==1.1.0" 135 | }, 136 | "lxml": { 137 | "hashes": [ 138 | "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", 139 | "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", 140 | "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", 141 | "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", 142 | "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", 143 | "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", 144 | "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", 145 | "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", 146 | "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", 147 | "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", 148 | "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", 149 | "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", 150 | "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", 151 | 
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", 152 | "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", 153 | "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", 154 | "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", 155 | "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", 156 | "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", 157 | "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", 158 | "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", 159 | "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", 160 | "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", 161 | "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", 162 | "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", 163 | "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", 164 | "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", 165 | "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", 166 | "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", 167 | "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", 168 | "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", 169 | "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", 170 | "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", 171 | "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", 172 | "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", 173 | "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", 174 | "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" 175 | ], 176 | "version": "==4.6.2" 177 | }, 178 | "multidict": { 179 | "hashes": [ 180 | "sha256:018132dbd8688c7a69ad89c4a3f39ea2f9f33302ebe567a879da8f4ca73f0d0a", 181 | "sha256:051012ccee979b2b06be928a6150d237aec75dd6bf2d1eeeb190baf2b05abc93", 182 | "sha256:05c20b68e512166fddba59a918773ba002fdd77800cad9f55b59790030bab632", 183 | "sha256:07b42215124aedecc6083f1ce6b7e5ec5b50047afa701f3442054373a6deb656", 184 | "sha256:0e3c84e6c67eba89c2dbcee08504ba8644ab4284863452450520dad8f1e89b79", 185 | "sha256:0e929169f9c090dae0646a011c8b058e5e5fb391466016b39d21745b48817fd7", 186 | "sha256:1ab820665e67373de5802acae069a6a05567ae234ddb129f31d290fc3d1aa56d", 187 | "sha256:25b4e5f22d3a37ddf3effc0710ba692cfc792c2b9edfb9c05aefe823256e84d5", 188 | "sha256:2e68965192c4ea61fff1b81c14ff712fc7dc15d2bd120602e4a3494ea6584224", 189 | "sha256:2f1a132f1c88724674271d636e6b7351477c27722f2ed789f719f9e3545a3d26", 190 | "sha256:37e5438e1c78931df5d3c0c78ae049092877e5e9c02dd1ff5abb9cf27a5914ea", 191 | "sha256:3a041b76d13706b7fff23b9fc83117c7b8fe8d5fe9e6be45eee72b9baa75f348", 192 | "sha256:3a4f32116f8f72ecf2a29dabfb27b23ab7cdc0ba807e8459e59a93a9be9506f6", 193 | "sha256:46c73e09ad374a6d876c599f2328161bcd95e280f84d2060cf57991dec5cfe76", 194 | "sha256:46dd362c2f045095c920162e9307de5ffd0a1bfbba0a6e990b344366f55a30c1", 195 | "sha256:4b186eb7d6ae7c06eb4392411189469e6a820da81447f46c0072a41c748ab73f", 196 | "sha256:54fd1e83a184e19c598d5e70ba508196fd0bbdd676ce159feb412a4a6664f952", 197 | "sha256:585fd452dd7782130d112f7ddf3473ffdd521414674c33876187e101b588738a", 198 | 
"sha256:5cf3443199b83ed9e955f511b5b241fd3ae004e3cb81c58ec10f4fe47c7dce37", 199 | "sha256:6a4d5ce640e37b0efcc8441caeea8f43a06addace2335bd11151bc02d2ee31f9", 200 | "sha256:7df80d07818b385f3129180369079bd6934cf70469f99daaebfac89dca288359", 201 | "sha256:806068d4f86cb06af37cd65821554f98240a19ce646d3cd24e1c33587f313eb8", 202 | "sha256:830f57206cc96ed0ccf68304141fec9481a096c4d2e2831f311bde1c404401da", 203 | "sha256:929006d3c2d923788ba153ad0de8ed2e5ed39fdbe8e7be21e2f22ed06c6783d3", 204 | "sha256:9436dc58c123f07b230383083855593550c4d301d2532045a17ccf6eca505f6d", 205 | "sha256:9dd6e9b1a913d096ac95d0399bd737e00f2af1e1594a787e00f7975778c8b2bf", 206 | "sha256:ace010325c787c378afd7f7c1ac66b26313b3344628652eacd149bdd23c68841", 207 | "sha256:b47a43177a5e65b771b80db71e7be76c0ba23cc8aa73eeeb089ed5219cdbe27d", 208 | "sha256:b797515be8743b771aa868f83563f789bbd4b236659ba52243b735d80b29ed93", 209 | "sha256:b7993704f1a4b204e71debe6095150d43b2ee6150fa4f44d6d966ec356a8d61f", 210 | "sha256:d5c65bdf4484872c4af3150aeebe101ba560dcfb34488d9a8ff8dbcd21079647", 211 | "sha256:d81eddcb12d608cc08081fa88d046c78afb1bf8107e6feab5d43503fea74a635", 212 | "sha256:dc862056f76443a0db4509116c5cd480fe1b6a2d45512a653f9a855cc0517456", 213 | "sha256:ecc771ab628ea281517e24fd2c52e8f31c41e66652d07599ad8818abaad38cda", 214 | "sha256:f200755768dc19c6f4e2b672421e0ebb3dd54c38d5a4f262b872d8cfcc9e93b5", 215 | "sha256:f21756997ad8ef815d8ef3d34edd98804ab5ea337feedcd62fb52d22bf531281", 216 | "sha256:fc13a9524bc18b6fb6e0dbec3533ba0496bbed167c56d0aabefd965584557d80" 217 | ], 218 | "version": "==5.1.0" 219 | }, 220 | "requests": { 221 | "hashes": [ 222 | "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", 223 | "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" 224 | ], 225 | "index": "pypi", 226 | "version": "==2.25.1" 227 | }, 228 | "ruia": { 229 | "extras": [ 230 | "uvloop" 231 | ], 232 | "hashes": [ 233 | "sha256:79585cb92862462547959a1da299fca4c76cbe16d72bc8d4a93e65c037f2431e" 234 | ], 235 | "index": "pypi", 236 | "version": "==0.8.0" 237 | }, 238 | "soupsieve": { 239 | "hashes": [ 240 | "sha256:407fa1e8eb3458d1b5614df51d9651a1180ea5fedf07feb46e45d7e25e6d6cdd", 241 | "sha256:d3a5ea5b350423f47d07639f74475afedad48cf41c0ad7a82ca13a3928af34f6" 242 | ], 243 | "markers": "python_version >= '3.0'", 244 | "version": "==2.2" 245 | }, 246 | "termcolor": { 247 | "hashes": [ 248 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" 249 | ], 250 | "index": "pypi", 251 | "version": "==1.1.0" 252 | }, 253 | "typer": { 254 | "hashes": [ 255 | "sha256:5455d750122cff96745b0dec87368f56d023725a7ebc9d2e54dd23dc86816303", 256 | "sha256:ba58b920ce851b12a2d790143009fa00ac1d05b3ff3257061ff69dbdfc3d161b" 257 | ], 258 | "index": "pypi", 259 | "version": "==0.3.2" 260 | }, 261 | "typing-extensions": { 262 | "hashes": [ 263 | "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918", 264 | "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c", 265 | "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f" 266 | ], 267 | "markers": "python_version < '3.8'", 268 | "version": "==3.7.4.3" 269 | }, 270 | "urllib3": { 271 | "hashes": [ 272 | "sha256:1b465e494e3e0d8939b50680403e3aedaa2bc434b7d5af64dfd3c958d7f5ae80", 273 | "sha256:de3eedaad74a2683334e282005cd8d7f22f4d55fa690a2a1020a416cb0a47e73" 274 | ], 275 | "version": "==1.26.3" 276 | }, 277 | "uvloop": { 278 | "hashes": [ 279 | "sha256:1ae1ad731c8c0dcee80e0ecf06274f0f7293244d2cef81fa2747321a370a6aba", 280 | 
"sha256:236a3c31096e0845029856f7bc07a938340c2cdb35d9d39b38c9253b672bf948", 281 | "sha256:47ec567151070ed770211d359ad9250b59368548c60212c7ef6dda3f5b1778f6", 282 | "sha256:66881fe8a2187334c4dd5010c56310bdf32fe426613f9ca727f090bc31280624", 283 | "sha256:7846828112bfb49abc5fdfc47d0e4dfd7402115c9fde3c14c31818cfbeeb63dc", 284 | "sha256:9541dc3f391941796ae95c9c3bb16b813acf9e3d4beebfd3b623f1acb22d318d", 285 | "sha256:ca8a9e982f0bfbe331f41902cdd721c6e749e4685a403685e792b86a584f5969", 286 | "sha256:e178c255622d928d464187e3ceba94db88465f6b17909c651483fb73af8d8b85", 287 | "sha256:e72779681f839b6a069d7e7a9f7962a1d1927612c5c2e33071415478bdc1b91b", 288 | "sha256:ed073d24e0c383c24d17d3a2bb209b999ff0a8130e89b7c3f033db9e0c3bd04f" 289 | ], 290 | "version": "==0.15.1" 291 | }, 292 | "yarl": { 293 | "hashes": [ 294 | "sha256:00d7ad91b6583602eb9c1d085a2cf281ada267e9a197e8b7cae487dadbfa293e", 295 | "sha256:0355a701b3998dcd832d0dc47cc5dedf3874f966ac7f870e0f3a6788d802d434", 296 | "sha256:15263c3b0b47968c1d90daa89f21fcc889bb4b1aac5555580d74565de6836366", 297 | "sha256:2ce4c621d21326a4a5500c25031e102af589edb50c09b321049e388b3934eec3", 298 | "sha256:31ede6e8c4329fb81c86706ba8f6bf661a924b53ba191b27aa5fcee5714d18ec", 299 | "sha256:324ba3d3c6fee56e2e0b0d09bf5c73824b9f08234339d2b788af65e60040c959", 300 | "sha256:329412812ecfc94a57cd37c9d547579510a9e83c516bc069470db5f75684629e", 301 | "sha256:4736eaee5626db8d9cda9eb5282028cc834e2aeb194e0d8b50217d707e98bb5c", 302 | "sha256:4953fb0b4fdb7e08b2f3b3be80a00d28c5c8a2056bb066169de00e6501b986b6", 303 | "sha256:4c5bcfc3ed226bf6419f7a33982fb4b8ec2e45785a0561eb99274ebbf09fdd6a", 304 | "sha256:547f7665ad50fa8563150ed079f8e805e63dd85def6674c97efd78eed6c224a6", 305 | "sha256:5b883e458058f8d6099e4420f0cc2567989032b5f34b271c0827de9f1079a424", 306 | "sha256:63f90b20ca654b3ecc7a8d62c03ffa46999595f0167d6450fa8383bab252987e", 307 | "sha256:68dc568889b1c13f1e4745c96b931cc94fdd0defe92a72c2b8ce01091b22e35f", 308 | "sha256:69ee97c71fee1f63d04c945f56d5d726483c4762845400a6795a3b75d56b6c50", 309 | "sha256:6d6283d8e0631b617edf0fd726353cb76630b83a089a40933043894e7f6721e2", 310 | "sha256:72a660bdd24497e3e84f5519e57a9ee9220b6f3ac4d45056961bf22838ce20cc", 311 | "sha256:73494d5b71099ae8cb8754f1df131c11d433b387efab7b51849e7e1e851f07a4", 312 | "sha256:7356644cbed76119d0b6bd32ffba704d30d747e0c217109d7979a7bc36c4d970", 313 | "sha256:8a9066529240171b68893d60dca86a763eae2139dd42f42106b03cf4b426bf10", 314 | "sha256:8aa3decd5e0e852dc68335abf5478a518b41bf2ab2f330fe44916399efedfae0", 315 | "sha256:97b5bdc450d63c3ba30a127d018b866ea94e65655efaf889ebeabc20f7d12406", 316 | "sha256:9ede61b0854e267fd565e7527e2f2eb3ef8858b301319be0604177690e1a3896", 317 | "sha256:b2e9a456c121e26d13c29251f8267541bd75e6a1ccf9e859179701c36a078643", 318 | "sha256:b5dfc9a40c198334f4f3f55880ecf910adebdcb2a0b9a9c23c9345faa9185721", 319 | "sha256:bafb450deef6861815ed579c7a6113a879a6ef58aed4c3a4be54400ae8871478", 320 | "sha256:c49ff66d479d38ab863c50f7bb27dee97c6627c5fe60697de15529da9c3de724", 321 | "sha256:ce3beb46a72d9f2190f9e1027886bfc513702d748047b548b05dab7dfb584d2e", 322 | "sha256:d26608cf178efb8faa5ff0f2d2e77c208f471c5a3709e577a7b3fd0445703ac8", 323 | "sha256:d597767fcd2c3dc49d6eea360c458b65643d1e4dbed91361cf5e36e53c1f8c96", 324 | "sha256:d5c32c82990e4ac4d8150fd7652b972216b204de4e83a122546dce571c1bdf25", 325 | "sha256:d8d07d102f17b68966e2de0e07bfd6e139c7c02ef06d3a0f8d2f0f055e13bb76", 326 | "sha256:e46fba844f4895b36f4c398c5af062a9808d1f26b2999c58909517384d5deda2", 327 | 
"sha256:e6b5460dc5ad42ad2b36cca524491dfcaffbfd9c8df50508bddc354e787b8dc2", 328 | "sha256:f040bcc6725c821a4c0665f3aa96a4d0805a7aaf2caf266d256b8ed71b9f041c", 329 | "sha256:f0b059678fd549c66b89bed03efcabb009075bd131c248ecdf087bdb6faba24a", 330 | "sha256:fcbb48a93e8699eae920f8d92f7160c03567b421bc17362a9ffbbd706a816f71" 331 | ], 332 | "version": "==1.6.3" 333 | } 334 | }, 335 | "develop": {} 336 | } 337 | -------------------------------------------------------------------------------- /scraper/zcool.py: -------------------------------------------------------------------------------- 1 | # @AUTHOR: lonsty 2 | # @DATE: 2019-09-07 18:34:18 3 | import json 4 | import math 5 | import os.path as op 6 | import re 7 | import sys 8 | import threading 9 | import time 10 | from collections import namedtuple 11 | from concurrent.futures import ThreadPoolExecutor, as_completed, wait 12 | from datetime import datetime 13 | from pathlib import Path 14 | from queue import Empty, Queue 15 | from typing import List 16 | from urllib.parse import urljoin, urlparse 17 | from uuid import uuid4 18 | 19 | import click 20 | import requests 21 | from bs4 import BeautifulSoup 22 | from termcolor import colored, cprint 23 | 24 | from scraper.utils import (mkdirs_if_not_exist, parse_resources, retry, 25 | safe_filename, sort_records) 26 | 27 | Scrapy = namedtuple('Scrapy', 'type author title objid index url') # 用于记录下载任务 28 | HEADERS = { 29 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' 30 | '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' 31 | } 32 | HOST_PAGE = 'https://www.zcool.com.cn' 33 | SEARCH_DESIGNER_SUFFIX = '/search/designer?&word={word}' 34 | USER_SUFFIX = '/u/{id}' 35 | PAGE_SUFFIX = '?myCate=0&sort=1&p={page}' 36 | WORK_SUFFIX = '/work/content/show?p=1&objectId={objid}' 37 | COLLECTION_SUFFIX = '/collection/contents?id={objid}&p={page}&pageSize=25' 38 | USER_API = 'https://www.zcool.com.cn/member/card/{id}' 39 | TIMEOUT = 30 40 | Q_TIMEOUT = 1 41 | MAX_WORKERS = 20 42 | RETRIES = 3 43 | 44 | thread_local = threading.local() 45 | 46 | 47 | def get_session(): 48 | """使线程获取同一个 Session,可减少 TCP 连接数,加速请求。 49 | 50 | :return requests.Session: session 51 | """ 52 | if not hasattr(thread_local, "session"): 53 | thread_local.session = requests.Session() 54 | return thread_local.session 55 | 56 | 57 | @retry(Exception, tries=RETRIES) 58 | def session_request(url: str, method: str = 'GET') -> requests.Response: 59 | """使用 session 请求数据。使用了装饰器 retry,在网络异常导致错误时会重试。 60 | 61 | :param str url: 目标请求 URL 62 | :param str method: 请求方式 63 | :return requests.Response: 响应数据 64 | """ 65 | resp = get_session().request(method, url, headers=HEADERS, timeout=TIMEOUT) 66 | resp.raise_for_status() 67 | return resp 68 | 69 | 70 | class ZCoolScraper(): 71 | 72 | def __init__(self, user_id=None, username=None, collection=None, destination=None, 73 | max_pages=None, spec_topics=None, max_topics=None, max_workers=None, 74 | retries=None, redownload=None, overwrite=False, thumbnail=False): 75 | """初始化下载参数。 76 | 77 | :param int user_id: 用户 ID 78 | :param str username: 用户名 79 | :param HttpUrl collection: 收藏集 URL 80 | :param str destination: 图片保存到本地的路径,默认当前路径 81 | :param int max_pages: 最大爬取页数,默认所有 82 | :param list spec_topics: 需要下载的特定主题 83 | :param int max_topics: 最大下载主题数量,默认所有 84 | :param int max_workers: 线程开启个数,默认 20 85 | :param int retries: 请求异常时的重试次数,默认 3 86 | :param str redownload: 下载记录文件,给定此文件则从失败记录进行下载 87 | :param bool overwrite: 是否覆盖已存在的文件,默认 False 88 | :param bool thumbnail: 是否下载缩略图,默认 False 89 | """ 90 | 
self.start_time = datetime.now() 91 | print(f' - - - - - -+-+ {self.start_time.ctime()} +-+- - - - - -\n') 92 | self.collection = collection 93 | self.spec_topics = spec_topics 94 | self.max_topics = max_topics or 'all' 95 | self.max_workers = max_workers or MAX_WORKERS 96 | self.pool = ThreadPoolExecutor(self.max_workers) 97 | self.overwrite = overwrite 98 | self.thumbnail = thumbnail 99 | self.pages = Queue() 100 | self.topics = Queue() 101 | self.images = Queue() 102 | self.stat = { 103 | 'npages': 0, 104 | 'ntopics': 0, 105 | 'nimages': 0, 106 | 'pages_pass': set(), 107 | 'pages_fail': set(), 108 | 'topics_pass': set(), 109 | 'topics_fail': set(), 110 | 'images_pass': set(), 111 | 'images_fail': set() 112 | } 113 | 114 | if retries: 115 | # 重置全局变量 RETRIES 116 | global RETRIES 117 | RETRIES = retries 118 | 119 | dest = Path(destination or '', urlparse(HOST_PAGE).netloc) 120 | 121 | # 从记录文件中的失败项开始下载 122 | if redownload: 123 | self.username = self.reload_records(redownload) 124 | self.user_id = self.search_id_by_username(self.username) 125 | self.max_pages = self.pages.qsize() 126 | self.max_topics = self.topics.qsize() 127 | self.directory = dest / safe_filename(self.username) 128 | self.stat.update({ 129 | 'npages': self.max_pages, 130 | 'ntopics': self.max_topics, 131 | 'nimages': self.images.qsize() 132 | }) 133 | print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n' 134 | f'{"User ID".rjust(17)}: {self.user_id}\n' 135 | f'{"Pages to scrapy".rjust(17)}: {self.max_pages:2d}\n' 136 | f'{"Topics to scrapy".rjust(17)}: {self.max_topics:3d}\n' 137 | f'{"Images to scrapy".rjust(17)}: {self.images.qsize():4d}\n' 138 | f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n') 139 | self.fetch_all(initialized=True) 140 | return 141 | 142 | # 从收藏集下载 143 | if collection: 144 | objid = self.parse_objid(collection, is_collection=True) 145 | resp = session_request(urljoin(HOST_PAGE, COLLECTION_SUFFIX.format(objid=objid, page=1))) 146 | data = resp.json().get('data', {}) 147 | total = data.get('total', 0) 148 | page_size = data.get('pageable', {}).get('pageSize') 149 | max_pages_ = math.ceil(total / page_size) 150 | self.max_pages = min(max_pages or 9999, max_pages_) 151 | self.directory = dest / safe_filename(f'{self.username}-{self._collection_name}') 152 | self.parse_collection_topics(data.get('content')) 153 | 154 | # 解析第 2 页 至 最大页的 topic 到下载任务 155 | for page in range(2, self.max_pages + 1): 156 | resp = session_request(urljoin(HOST_PAGE, COLLECTION_SUFFIX.format(objid=objid, page=page))) 157 | self.parse_collection_topics(topics=resp.json().get('data', {}).get('content'), 158 | offset=page_size * (page - 1)) 159 | 160 | # 根据用户 ID 或用户名下载 161 | else: 162 | self.user_id = user_id or self.search_id_by_username(username) 163 | self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id)) 164 | 165 | try: 166 | response = session_request(self.base_url) 167 | except requests.exceptions.ProxyError: 168 | cprint('Cannot connect to proxy.', 'red') 169 | sys.exit(1) 170 | except Exception as e: 171 | cprint(f'Failed to connect to {self.base_url}, {e}', 'red') 172 | sys.exit(1) 173 | 174 | soup = BeautifulSoup(markup=response.text, features='html.parser') 175 | try: 176 | author = soup.find(name='div', id='body').get('data-name') 177 | if username and username != author: 178 | cprint(f'Invalid user id:「{user_id}」or username:「{username}」!', 'red') 179 | sys.exit(1) 180 | self.username = author 181 | except Exception: 182 | self.username = username or 
'anonymous' 183 | self.directory = dest / safe_filename(self.username) 184 | try: 185 | max_pages_ = int(soup.find(id='laypage_0').find_all(name='a')[-2].text) 186 | except Exception: 187 | max_pages_ = 1 188 | self.max_pages = min(max_pages or 9999, max_pages_) 189 | 190 | if self.spec_topics: 191 | topics = ', '.join(self.spec_topics) 192 | elif self.max_topics == 'all': 193 | topics = 'all' 194 | else: 195 | topics = self.max_pages * self.max_topics 196 | print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n' 197 | f'{"User ID".rjust(17)}: {self.user_id}\n' 198 | f'{"Maximum pages".rjust(17)}: {max_pages_}\n' 199 | f'{"Pages to scrapy".rjust(17)}: {self.max_pages}\n' 200 | f'{"Topics to scrapy".rjust(17)}: {topics}\n' 201 | f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n') 202 | 203 | self.END_PARSING_TOPICS = False 204 | self.fetch_all(initialized=True if self.collection else False) 205 | 206 | def search_id_by_username(self, username): 207 | """通过用户昵称查找用户 ID。 208 | 209 | :param str username: 用户昵称 210 | :return int: 用户 ID 211 | """ 212 | if not username: 213 | cprint('Must give an or !', 'yellow') 214 | sys.exit(1) 215 | 216 | search_url = urljoin(HOST_PAGE, SEARCH_DESIGNER_SUFFIX.format(word=username)) 217 | try: 218 | response = session_request(search_url) 219 | except requests.exceptions.ProxyError: 220 | cprint('Cannot connect to proxy.', 'red') 221 | sys.exit(1) 222 | except Exception as e: 223 | cprint(f'Failed to connect to {search_url}, {e}', 'red') 224 | sys.exit(1) 225 | 226 | author_1st = BeautifulSoup(response.text, 'html.parser').find(name='div', class_='author-info') 227 | if (not author_1st) or (author_1st.get('data-name') != username): 228 | cprint(f'Username「{username}」does not exist!', 'yellow') 229 | sys.exit(1) 230 | 231 | return author_1st.get('data-id') 232 | 233 | def reload_records(self, file): 234 | """从本地下载记录里读取下载失败的内容。 235 | 236 | :param str file: 下载记录文件的路径。 237 | :return str: 用户名 238 | """ 239 | with open(file, 'r', encoding='utf-8') as f: 240 | for fail in json.loads(f.read()).get('fail'): 241 | scrapy = Scrapy._make(fail.values()) 242 | if scrapy.type == 'page': 243 | self.pages.put(scrapy) 244 | elif scrapy.type == 'topic': 245 | self.topics.put(scrapy) 246 | elif scrapy.type == 'image': 247 | self.images.put(scrapy) 248 | return scrapy.author 249 | 250 | def generate_pages(self): 251 | """根据最大下载页数,生成需要爬取主页的任务。""" 252 | for page in range(1, self.max_pages + 1): 253 | suffix = COLLECTION_SUFFIX if self.collection else PAGE_SUFFIX 254 | url = urljoin(self.base_url, suffix.format(page=page)) 255 | scrapy = Scrapy(type='page', author=self.username, title=page, 256 | objid=None, index=page - 1, url=url) 257 | if scrapy not in self.stat["pages_pass"]: 258 | self.pages.put(scrapy) 259 | 260 | def parse_collection_topics(self, topics: List[dict], offset: int = 0): 261 | for idx, topic in enumerate(topics): 262 | new_scrapy = Scrapy(type='topic', 263 | author=topic.get('creatorObj', {}).get('username'), 264 | title=topic.get('title'), 265 | objid=topic.get('id'), 266 | index=offset + idx, 267 | url=topic.get('pageUrl')) 268 | if new_scrapy not in self.stat["topics_pass"]: 269 | self.topics.put(new_scrapy) 270 | self.stat["ntopics"] += 1 271 | 272 | def parse_topics(self, scrapy): 273 | """爬取主页,解析所有 topic,并将爬取主题的任务添加到任务队列。 274 | 275 | :param scrapy: 记录任务信息的数据体 276 | :return Scrapy: 记录任务信息的数据体 277 | """ 278 | resp = session_request(scrapy.url) 279 | cards = BeautifulSoup(resp.text, 'html.parser').find_all(name='a', 
280 |         for idx, card in enumerate(cards if self.max_topics == 'all' else cards[:self.max_topics + 1]):
281 |             title = card.get('title')
282 |             if self.spec_topics and (title not in self.spec_topics):
283 |                 continue
284 | 
285 |             new_scrapy = Scrapy(type='topic', author=scrapy.author, title=title,
286 |                                 objid=None, index=idx, url=card.get('href'))
287 |             if new_scrapy not in self.stat["topics_pass"]:
288 |                 self.topics.put(new_scrapy)
289 |                 self.stat["ntopics"] += 1
290 |         return scrapy
291 | 
292 |     def fetch_topics(self):
293 |         """Take pages from the task queue and parse them in the thread pool to collect topics to scrape."""
294 |         page_futures = {}
295 |         while True:
296 |             try:
297 |                 scrapy = self.pages.get(timeout=Q_TIMEOUT)
298 |                 page_futures[self.pool.submit(self.parse_topics, scrapy)] = scrapy
299 |             except Empty:
300 |                 break
301 |             except Exception:
302 |                 continue
303 | 
304 |         for future in as_completed(page_futures):
305 |             scrapy = page_futures.get(future)
306 |             try:
307 |                 future.result()
308 |                 self.stat["pages_pass"].add(scrapy)
309 |             except Exception:
310 |                 self.stat["pages_fail"].add(scrapy)
311 |                 cprint(f'GET page: {scrapy.title} ({scrapy.url}) failed.', 'red')
312 |         self.END_PARSING_TOPICS = True
313 | 
314 |     def parse_objid(self, url: str, is_collection: bool = False) -> str:
315 |         """Parse the objid from a topic or collection page.
316 | 
317 |         :param url: URL of the topic or collection
318 |         :return: the objid
319 |         """
320 |         soup = BeautifulSoup(session_request(url).text, 'html.parser')
321 |         objid = soup.find('input', id='dataInput').attrs.get('data-objid')
322 |         if is_collection:
323 |             self._collection_name = soup.find('h2', class_='title-h2').text
324 |             user = soup.find(name='span', class_='details-user-avatar')
325 |             self.user_id = user.find('div').attrs.get('data-id')
326 |             self.username = user.find('a').attrs.get('title')
327 |         return objid
328 | 
329 |     def parse_images(self, scrapy):
330 |         """Scrape a topic: resolve its objid, call the API to get image URLs and related info,
331 | 
332 |         then put image-download tasks into the queue.
333 |         :param scrapy: the task record
334 |         :return Scrapy: the task record
335 |         """
336 |         objid = scrapy.objid or self.parse_objid(scrapy.url)
337 |         resp = session_request(urljoin(HOST_PAGE, WORK_SUFFIX.format(objid=objid)))
338 |         data = resp.json().get('data', {})
339 |         author = data.get('product', {}).get('creatorObj', {}).get('username')
340 |         title = data.get('product', {}).get('title')
341 |         objid = data.get('product', {}).get('id')
342 | 
343 |         for img in data.get('allImageList', []):
344 |             new_scrapy = Scrapy(type='image', author=author, title=title,
345 |                                 objid=objid, index=img.get('orderNo') or 0, url=img.get('url'))
346 |             if new_scrapy not in self.stat["images_pass"]:
347 |                 self.images.put(new_scrapy)
348 |                 self.stat["nimages"] += 1
349 |         return scrapy
350 | 
351 |     def fetch_images(self):
352 |         """Take topics from the task queue and parse them in the thread pool to collect images to download."""
353 |         image_futures = {}
354 |         while True:
355 |             try:
356 |                 scrapy = self.topics.get(timeout=Q_TIMEOUT)
357 |                 image_futures[self.pool.submit(self.parse_images, scrapy)] = scrapy
358 |             except Empty:
359 |                 if self.END_PARSING_TOPICS:
360 |                     break
361 |             except Exception:
362 |                 continue
363 | 
364 |         for future in as_completed(image_futures):
365 |             scrapy = image_futures.get(future)
366 |             try:
367 |                 future.result()
368 |                 self.stat["topics_pass"].add(scrapy)
369 |             except Exception:
370 |                 self.stat["topics_fail"].add(scrapy)
371 |                 cprint(f'GET topic: {scrapy.title} ({scrapy.url}) failed.', 'red')
372 | 
373 |     def fetch_all(self, initialized: bool = False):
374 |         """Scrape pages and topics concurrently and keep the status display updated."""
375 |         if not initialized:
376 |             self.generate_pages()
377 |         fetch_futures = [self.pool.submit(self.fetch_topics),
378 |                          self.pool.submit(self.fetch_images)]
379 |         end_show_fetch = False
380 |         t = threading.Thread(target=self.show_fetch_status, kwargs={'end': lambda: end_show_fetch})
381 |         t.start()
382 |         try:
383 |             wait(fetch_futures)
384 |         except KeyboardInterrupt:
385 |             raise
386 |         finally:
387 |             end_show_fetch = True
388 |             t.join()
389 | 
390 |     def show_fetch_status(self, interval=0.5, end=None):
391 |         """Run in a background thread to display the scraping status while fetching.
392 | 
393 |         :param float interval: refresh interval in seconds
394 |         :param function end: callable used to signal the thread to exit
395 |         """
396 |         while True:
397 |             status = 'Fetched Pages: {pages}\tTopics: {topics}\tImages: {images}'.format(
398 |                 pages=colored(str(self.max_pages).rjust(3), 'blue'),
399 |                 topics=colored(str(self.stat["ntopics"]).rjust(3), 'blue'),
400 |                 images=colored(str(self.stat["nimages"]).rjust(5), 'blue'))
401 |             print(status, end='\r', flush=True)
402 |             if (interval == 0) or (end and end()):
403 |                 print('\n')
404 |                 break
405 |             time.sleep(interval)
406 | 
407 |     def show_download_status(self, interval=0.5, end=None):
408 |         """Run in a background thread to display the download status while downloading.
409 | 
410 |         :param float interval: refresh interval in seconds
411 |         :param function end: callable used to signal the thread to exit
412 |         """
413 |         while True:
414 |             completed = len(self.stat["images_pass"]) + len(self.stat["images_fail"])
415 |             if self.stat["nimages"] > 0:
416 |                 status = 'Time used: {time_used}\tFailed: {failed}\tCompleted: {completed}'.format(
417 |                     time_used=colored(str(datetime.now() - self.start_time)[:-7], 'yellow'),
418 |                     failed=colored(str(len(self.stat["images_fail"])).rjust(3), 'red'),
419 |                     completed=colored(str(int(completed / self.stat["nimages"] * 100))
420 |                                       + f'% ({completed}/{self.stat["nimages"]})', 'green'))
421 |                 print(status, end='\r', flush=True)
422 |             if (interval == 0) or (end and end()):
423 |                 if self.stat["nimages"] > 0:
424 |                     print('\n')
425 |                 break
426 |             time.sleep(interval)
427 | 
428 |     def download_image(self, scrapy):
429 |         """Download an image and save it to the local disk.
430 | 
431 |         :param scrapy: the task record
432 |         :return Scrapy: the task record
433 |         """
434 |         try:
435 |             name = re.findall(r'(?<=/)\w*?\.(?:jpg|gif|png|bmp)', scrapy.url, re.IGNORECASE)[0]
436 |         except IndexError:
437 |             name = uuid4().hex + '.jpg'
438 | 
439 |         path = self.directory / safe_filename(scrapy.title)
440 |         filename = path / f'[{scrapy.index + 1 or 0:02d}]{name}'
441 |         if (not self.overwrite) and op.isfile(filename):
442 |             return scrapy
443 | 
444 |         url = scrapy.url
445 |         if self.thumbnail:
446 |             if url.lower().endswith(('jpg', 'png', 'bmp')):
447 |                 # Append ZCool's resize suffix to request a 1280px-wide thumbnail
448 |                 url = f'{scrapy.url}@1280w_1l_2o_100sh.{url[-3:]}'
449 |         resp = session_request(url)
450 | 
451 |         mkdirs_if_not_exist(path)
452 |         with open(filename, 'wb') as f:
453 |             for chunk in resp.iter_content(8192):
454 |                 f.write(chunk)
455 |         return scrapy
456 | 
457 |     def save_records(self):
458 |         """Save the successful and failed download records to a local file.
459 | 
460 |         :return str: path to the records file
461 |         """
462 |         filename = f'{safe_filename(self.start_time.isoformat()[:-7])}.json'
463 |         abspath = op.abspath(self.directory / filename)
464 |         with open(abspath, 'w', encoding='utf-8') as f:
465 |             success = (self.stat["pages_pass"] | self.stat["topics_pass"] | self.stat["images_pass"])
466 |             fail = (self.stat["pages_fail"] | self.stat["topics_fail"] | self.stat["images_fail"])
467 |             type_order = {'page': 1, 'topic': 2, 'image': 3}
468 |             s_ordered = sort_records(success, order=type_order)
469 |             f_ordered = sort_records(fail, order=type_order)
470 | 
471 |             records = {
472 |                 'time': self.start_time.isoformat(),
473 |                 'success': [scrapy._asdict() for scrapy in s_ordered],
474 |                 'fail': [scrapy._asdict() for scrapy in f_ordered]
475 |             }
476 |             f.write(json.dumps(records, ensure_ascii=False, indent=2))
477 |         return abspath
478 | 
479 |     def run_scraper(self):
480 |         """Download all images with multiple threads, save the records when finished, then exit."""
481 |         end_show_download = False
482 |         t = threading.Thread(target=self.show_download_status, kwargs={'end': lambda: end_show_download})
483 |         t.start()
484 | 
485 |         image_futures = {}
486 |         while True:
487 |             try:
488 |                 scrapy = self.images.get_nowait()
489 |                 if scrapy not in self.stat["images_pass"]:
490 |                     image_futures[self.pool.submit(self.download_image, scrapy)] = scrapy
491 |             except Empty:
492 |                 break
493 |             except KeyboardInterrupt:
494 |                 raise
495 |             except Exception:
496 |                 continue
497 | 
498 |         try:
499 |             for future in as_completed(image_futures):
500 |                 scrapy = image_futures.get(future)
501 |                 try:
502 |                     future.result()
503 |                     self.stat["images_pass"].add(scrapy)
504 |                 except Exception:
505 |                     self.stat["images_fail"].add(scrapy)
506 |                     cprint(f'Download image: {scrapy.title}[{scrapy.index + 1}] '
507 |                            f'({scrapy.url}) failed.', 'red')
508 |         except KeyboardInterrupt:
509 |             raise
510 |         finally:
511 |             end_show_download = True
512 |             t.join()
513 | 
514 |         saved_images = len(self.stat["images_pass"])
515 |         failed_images = len(self.stat["images_fail"])
516 |         if saved_images or failed_images:
517 |             if saved_images:
518 |                 print(f'Saved {colored(saved_images, "green")} images to '
519 |                       f'{colored(self.directory.absolute(), attrs=["underline"])}')
520 |             records_path = self.save_records()
521 |             print(f'Saved records to {colored(records_path, attrs=["underline"])}')
522 |         else:
523 |             cprint('No images to download.', 'yellow')
524 | 
525 | 
526 | @click.command()
527 | @click.option('-u', '--usernames', 'names', help='One or more user names, separated by commas.')
528 | @click.option('-i', '--ids', 'ids', help='One or more user IDs, separated by commas.')
529 | @click.option('-c', '--collections', 'collections', help='One or more collection URLs, separated by commas.')
530 | @click.option('-t', '--topics', 'topics', help='Specific topics to download, separated by commas.')
531 | @click.option('-d', '--destination', 'destination', help='Destination to save images.')
532 | @click.option('-R', '--retries', 'retries', default=RETRIES, show_default=True, type=int,
533 |               help='Number of retries for failed downloads.')
534 | @click.option('-r', '--redownload', 'redownload',
535 |               help='Redownload images from failed records (PATH of the .json file).')
536 | @click.option('-o', '--overwrite', 'overwrite', is_flag=True, default=False, help='Overwrite existing files.')
537 | @click.option('--thumbnail', 'thumbnail', is_flag=True, default=False,
538 |               help='Download thumbnails with a maximum width of 1280px.')
539 | @click.option('--max-pages', 'max_pages', type=int, help='Maximum pages to download.')
540 | @click.option('--max-topics', 'max_topics', type=int, help='Maximum topics per page to download.')
541 | @click.option('--max-workers', 'max_workers', default=MAX_WORKERS, show_default=True, type=int,
542 |               help='Maximum thread workers.')
543 | def zcool_command(ids, names, collections, destination, max_pages, topics, max_topics,
544 |                   max_workers, retries, redownload, overwrite, thumbnail):
545 |     """ZCool picture crawler. Download pictures, photos and illustrations from
546 |     ZCool (https://zcool.com.cn/). Visit https://github.com/lonsty/scraper.
546 | """ 547 | if redownload: 548 | scraper = ZCoolScraper(destination=destination, max_pages=max_pages, spec_topics=topics, 549 | max_topics=max_topics, max_workers=max_workers, retries=retries, 550 | redownload=redownload, overwrite=overwrite, thumbnail=thumbnail) 551 | scraper.run_scraper() 552 | 553 | elif any([ids, names, collections]): 554 | topics = topics.split(',') if topics else [] 555 | resources = parse_resources(ids, names, collections) 556 | for res in resources: 557 | scraper = ZCoolScraper(user_id=res.id, username=res.name, collection=res.collection, 558 | destination=destination, max_pages=max_pages, spec_topics=topics, 559 | max_topics=max_topics, max_workers=max_workers, retries=retries, 560 | redownload=redownload, overwrite=overwrite) 561 | scraper.run_scraper() 562 | 563 | else: 564 | click.echo('Try "python zcool.py --help" for help.') 565 | return 1 566 | return 0 567 | --------------------------------------------------------------------------------