├── docs
│   ├── get_cookie.png
│   ├── screenshot_1.png
│   ├── screenshot_2.png
│   ├── get_cookie.md
│   └── README_CN.md
├── requirements.txt
├── main.py
├── weibo_image_spider
│   ├── __init__.py
│   ├── exceptions.py
│   ├── models.py
│   ├── constants.py
│   ├── utils.py
│   ├── cli.py
│   └── spider_workers.py
├── Pipfile
├── LICENSE
├── .gitignore
├── README.md
└── Pipfile.lock
/docs/get_cookie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/get_cookie.png
--------------------------------------------------------------------------------
/docs/screenshot_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/screenshot_1.png
--------------------------------------------------------------------------------
/docs/screenshot_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/screenshot_2.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.8.2
2 | click==7.1.1
3 | termcolor==1.1.0
4 | requests==2.23.0
5 | pydantic==1.4
6 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | from weibo_image_spider.cli import weibo_command
4 |
5 | if __name__ == "__main__":
6 | weibo_command()
7 |
--------------------------------------------------------------------------------
/weibo_image_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:22
3 | from .spider_workers import crawl_worker, download_worker, query_user_by_name
4 |
5 | __author__ = "Allen Shaw"
6 | __version__ = "0.1.0"
7 |
8 | __all__ = ["crawl_worker", "download_worker", "query_user_by_name"]
9 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | beautifulsoup4 = "==4.8.2"
10 | click = "==7.1.1"
11 | termcolor = "==1.1.0"
12 | requests = "==2.23.0"
13 | pydantic = "==1.4"
14 |
15 | [requires]
16 | python_version = "3.6"
17 |
--------------------------------------------------------------------------------
/weibo_image_spider/exceptions.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:01
3 |
4 |
5 | class CookiesExpiredException(Exception):
6 | pass
7 |
8 |
9 | class NoImagesException(Exception):
10 | pass
11 |
12 |
13 | class ContentParserError(Exception):
14 | pass
15 |
16 |
17 | class UserNotFound(Exception):
18 | pass
19 |
--------------------------------------------------------------------------------
/docs/get_cookie.md:
--------------------------------------------------------------------------------
1 | ## Getting the cookie for the web version of Weibo:
2 | 
3 | 1. Go to the Weibo homepage [https://www.weibo.com/](https://www.weibo.com/) and log in with your personal account;
4 | 
5 | 2. Taking Google Chrome as an example, press F12 to open the developer tools and click "Network" → "XHR", then press F5 to refresh. Select any request in the XHR list, open its Headers, copy the Cookie value from the Request Headers, and paste it into the [cookie](../cookie) file in the project root, replacing its previous content.
6 |
7 | 
8 |
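For reference, a minimal shell sketch of creating that `cookie` file from the project root; the cookie string below is only a placeholder, paste the value copied from your browser instead:

```sh
# Run from the repository root. Replace the placeholder with the Cookie value
# copied from the Request Headers; the spider reads this file on startup.
$ echo 'SUB=<your-sub-token>; SUBP=<your-subp-token>' > cookie
```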
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Allen Shaw
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/weibo_image_spider/models.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 15:30
3 | import time
4 | from datetime import datetime
5 | from queue import Queue
6 |
7 | from pydantic import BaseModel
8 | from termcolor import colored
9 |
10 | downloading_jobs = Queue()
11 | appointment_jobs = Queue()
12 |
13 |
14 | class User(BaseModel):
15 | name = ""
16 | uid: int = 0
17 | host: str = ""
18 |
19 |
20 | class PhotoAPI(BaseModel):
21 | action_data: str = ""
22 | page_id: int = 0
23 | page: int = 1
24 |
25 | @property
26 | def api(self):
27 | return (
28 | f"https://weibo.com/p/aj/album/loading?ajwvr=6&{self.action_data}"
29 | f"&page_id={self.page_id}&page={self.page}&ajax_call=1&__rnd={self.rnd}"
30 | )
31 |
32 | @property
33 | def rnd(self):
34 | return int(time.time() * 1000)
35 |
36 |
37 | class Parameters(BaseModel):
38 | nickname = ""
39 | uid: int = 0
40 | destination: str
41 | overwrite: bool
42 | thumbnail: bool
43 | max_images: int
44 | max_workers: int
45 | verbose: bool
46 |
47 |
48 | class Status(BaseModel):
49 | succeed = []
50 | failed = []
51 | start_time = datetime.now()
52 |
53 | @property
54 | def total_complete(self):
55 | return len(self.succeed) + len(self.failed)
56 |
57 | @property
58 | def start_time_repr(self):
59 | return self.start_time.ctime()
60 |
61 | @property
62 | def time_used(self):
63 | return str(datetime.now() - self.start_time)[:-7]
64 |
65 | @property
66 | def fmt_status(self):
67 | return (
68 | f'[Succeed: {colored(str(len(self.succeed)), "green")}, '
69 | f'Failed: {colored(str(len(self.failed)), "red")}]'
70 | )
71 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea
107 | weibo_images/
108 |
--------------------------------------------------------------------------------
/docs/README_CN.md:
--------------------------------------------------------------------------------
1 | # Weibo Image Spider
2 |
3 | 微博图片爬虫,极速下载、高清原图、多种命令、简单实用。
4 |
5 | ### 特点:
6 |
7 | - [x] 极速下载:多线程异步下载,可以根据需要设置线程数
8 | - [x] 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/!
9 | - [x] 增量下载:用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯!
10 | - [x] 高清原图:默认下载高清原图,可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
11 |
12 | ### 环境:
13 |
14 | - `python3.6` 及以上
15 |
16 | # 快速使用
17 |
18 | ## 1. 克隆项目到本地
19 |
20 | ```sh
21 | $ git clone https://github.com/lonsty/weibo-image-spider.git
22 | ```
23 |
24 | ## 2. 安装依赖包
25 |
26 | ```sh
27 | $ cd weibo-image-spider
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | ## 3. 快速使用
32 |
33 | **注意**:
34 |
35 | *因网页版微博限制,使用爬虫请求其 API 时,需要 cookie 认证,关于 [如何获取 cookie](get_cookie.md)?
36 | 且 cookie 有效期为一天(第二天零点失效),所以最好不要跨天爬取。*
37 |
38 | 下载用户昵称为 `nickname` 的最新 2000(可使用 `-n` 修改) 张图片到路径 `dest` 下:
39 |
40 | ```sh
41 | $ python main.py -u <nickname> -d <dest>
42 | ```
43 |
44 | 运行截图
45 |
46 | 
47 |
48 | 爬取结果
49 |
50 | 
51 |
52 | # 使用帮助
53 |
54 | ### 常用命令
55 |
56 | - 部分图片 **下载失败** 或 **微博有更新**,再执行相同的命令,对失败或新增的图片进行下载
57 |
58 | ```sh
59 | $ python main.py -u <nickname> -d <dest>
60 | ```
61 |
62 | ### 查看所有命令
63 |
64 | ```
65 | $ python main.py --help
66 |
67 | Usage: main.py [OPTIONS]
68 |
69 | A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.
70 |
71 | Options:
72 | -u, --nickname TEXT Nickname
73 | -d, --destination TEXT Directory to save images [default:
74 | weibo_images/]
75 |
76 | -o, --overwrite Overwrite existing files [default: False]
77 | -t, --thumbnail Download thumbnails with a maximum width of 690px
78 | [default: False]
79 |
80 | -n, --max-images INTEGER Maximum number of images to download [default:
81 | 2000]
82 |
83 | -w, --max-workers INTEGER Maximum thread workers [default: 15]
84 | -P, --proxies TEXT Use proxies to access websites. Example:
85 | '{"http": "user:passwd@www.example.com:port",
86 | "https": "user:passwd@www.example.com:port"}'
87 |
88 | --help Show this message and exit.
89 | ```
90 |
91 | # 更新历史
92 |
93 | - ## Version 0.1.0a (2020-03-29)
94 |
95 | 主要功能:
96 |
97 | - 极速下载:多线程异步下载,可以根据需要设置线程数
98 | - 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/!
99 | - 增量下载:用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯!
100 | - 高清原图:默认下载高清原图,可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
101 |
102 | # LICENSE
103 |
104 | 此项目使用 [MIT](LICENSE) 开源协议
105 |
106 | **注意**:使用此工具下载的所有内容,版权归原作者所有,请谨慎使用!
107 |
--------------------------------------------------------------------------------
/weibo_image_spider/constants.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:27
3 | import json
4 | import logging
5 | import os
6 | import random
7 | import re
8 | import time
9 | # from random import choice, random
10 | from typing import List
11 |
12 | from pydantic import BaseModel
13 | from weibo_image_spider.models import PhotoAPI, Status, User
14 | from weibo_image_spider.utils import convert_to_safe_filename, read_cookie
15 |
16 |
17 | class Constant(BaseModel):
18 | search_url: str = "https://s.weibo.com/user?q={user}&Refer=weibo_user"
19 | search_api: str = "https://s.weibo.com/ajax/topsuggest.php?key={user}&_k={ts}&_t=1&outjson=1&uid={uid}"
20 | img_hosts: List[str] = ["https://wx1.sinaimg.cn", "https://wx2.sinaimg.cn", "https://wx3.sinaimg.cn"]
21 | cookies_raw: str = ""
22 | user: User = User()
23 | photo_api: PhotoAPI = PhotoAPI()
24 | status: Status = Status()
25 | nickname: str = "lonsty"
26 | destination: str = "weibo_images"
27 | overwrite: bool = False
28 | thumbnail: bool = False
29 | max_images: int = 2000
30 | max_workers: int = 15
31 | proxies_raw: str = None
32 | timeout: int = 10
33 | cancel: bool = False
34 | end_crawler: bool = False
35 | verbose: bool = False
36 |
37 | def __init__(self, **kargs):
38 | super(Constant, self).__init__(**kargs)
39 | self.cookies_raw = read_cookie()
40 |
41 | @property
42 | def cookies(self):
43 | try:
44 | return dict([item.split("=")[0], item.split("=")[1]] for item in self.cookies_raw.split("; "))
45 | except Exception as e:
46 | logging.warning(e)
47 | return None
48 |
49 | @property
50 | def img_url_prefix(self):
51 | return f'{random.choice(self.img_hosts)}/{"large" if not self.thumbnail else "mw690"}/'
52 |
53 | @property
54 | def saved_dir(self):
55 | return os.path.join(os.path.abspath(self.destination), convert_to_safe_filename(self.user.name))
56 |
57 | @property
58 | def rex_pattern(self):
59 |         return re.compile(r"(?<=/)\w*?\.(?:jpg|gif)", re.IGNORECASE)
60 |
61 | @property
62 | def user_photo_api(self):
63 | return self.photo_api.api
64 |
65 | @property
66 | def user_search_api(self):
67 | return self.search_api.format(
68 | user=self.nickname, ts=int(time.time() * 1000), uid=random.randrange(1_000_000_000, 9_999_999_999)
69 | )
70 |
71 | @property
72 | def proxies(self):
73 | if isinstance(self.proxies_raw, str):
74 | try:
75 | return json.loads(self.proxies_raw)
76 | except Exception as e:
77 | logging.warning(f"Proxy will not be used: {e}")
78 | return None
79 | return None
80 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weibo Image Spider
2 |
3 | A Weibo image spider: fast downloads, full-resolution originals, handy commands, simple and practical.
4 | 
5 | ### Features:
6 | 
7 | - [x] Fast downloads: multi-threaded asynchronous downloading, with a configurable number of worker threads
8 | - [x] Retry on failure: with enough retries, there is no image that cannot be downloaded \(^o^)/!
9 | - [x] Incremental downloads: when the user uploads something new, just run the program again O(∩_∩)O
10 | - [x] Original quality: downloads full-resolution originals by default; use `--thumbnail` to download thumbnails (maximum width 690px)
11 | 
12 | ### Requirements:
13 | 
14 | - `python3.6` or later
15 | 
16 | # Quick start
17 | 
18 | ## 1. Clone the repository
19 |
20 | ```sh
21 | $ git clone https://github.com/lonsty/weibo-image-spider.git
22 | ```
23 |
24 | ## 2. Install the dependencies
25 |
26 | ```sh
27 | $ cd weibo-image-spider
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | ## 3. Quick start
32 | 
33 | **Note**:
34 | 
35 | *Because of restrictions on the web version of Weibo, the spider needs cookie authentication when calling its API; see [how to get the cookie](docs/get_cookie.md).
36 | Also, a cookie is only valid for one day (it expires at midnight), so it is best not to crawl across days.*
37 | 
38 | Download the latest 2000 images (adjustable with `-n`) of the user whose nickname or user ID is `nickname` (or `user-id`) into the directory `dest`:
39 |
40 | ```sh
41 | $ python main.py -u <nickname> -d <dest>
42 | ```
43 |
44 | Screenshot of a run
45 | 
46 | 
47 | 
48 | Crawl results
49 | 
50 | 
51 |
52 | # Usage
53 | 
54 | ### Common commands
55 | 
56 | - If some images **failed to download** or the user has **posted new content**, run the same command again to download the failed or newly added images
57 |
58 | ```sh
59 | $ python main.py -u <nickname> -d <dest>
60 | ```
61 |
62 | ### Show all options
63 |
64 | ```
65 | $ python main.py --help
66 |
67 | Usage: main.py [OPTIONS]
68 |
69 | A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.
70 |
71 | Options:
72 | -u, --nickname, --user-id TEXT Nickname or User ID
73 | -d, --destination TEXT Directory to save images [default:
74 | weibo_images/]
75 |
76 | -o, --overwrite Overwrite existing files [default: False]
77 | -t, --thumbnail Download thumbnails with a maximum width of
78 | 690px [default: False]
79 |
80 | -n, --max-images INTEGER Maximum number of images to download
81 | [default: 2000]
82 |
83 | -w, --max-workers INTEGER Maximum thread workers [default: 15]
84 | -P, --proxies TEXT Use proxies to access websites. Example:
85 | '{"http":
86 | "user:passwd@www.example.com:port", "https":
87 | "user:passwd@www.example.com:port"}'
88 |
89 | --help Show this message and exit.
90 |
91 | ```
92 |
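For example, a single run combining several of these options (the nickname and the proxy credentials below are placeholders):

```sh
# Download up to 500 thumbnails with 20 worker threads, routing requests
# through an HTTP/HTTPS proxy (all values here are placeholders).
$ python main.py -u <nickname> -d weibo_images/ -n 500 -w 20 -t \
    -P '{"http": "user:passwd@www.example.com:port", "https": "user:passwd@www.example.com:port"}'
```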
93 | # Changelog
94 |
95 | - ## Version 0.1.2 (2021-11-13)
96 |
97 | - Fixed a failure when querying user information that prevented downloads from continuing
98 |
99 | - ## Version 0.1.1 (2021-08-26)
100 |
101 | New features:
102 |
103 | - Support downloading by user ID: `python main.py -u <user_id>`
104 |
105 | - ## Version 0.1.0 (2021-05-16)
106 |
107 | - Restructured the code
108 | - Fixed occasionally incomplete image downloads
109 | - Fixed the total number of downloads not matching the requested amount
110 |
111 | - ## Version 0.1.0a (2020-03-29)
112 |
113 | Main features:
114 |
115 | - Fast downloads: multi-threaded asynchronous downloading, with a configurable number of worker threads
116 | - Retry on failure: with enough retries, there is no image that cannot be downloaded \(^o^)/!
117 | - Incremental downloads: when the user uploads something new, just run the program again O(∩_∩)O
118 | - Original quality: downloads full-resolution originals by default; use `--thumbnail` to download thumbnails (maximum width 690px)
119 |
120 | # LICENSE
121 |
122 | This project is licensed under the [MIT](LICENSE) License.
123 |
124 | **Note**: The copyright of all content downloaded with this tool belongs to its original authors. Please use it with care!
125 |
--------------------------------------------------------------------------------
/weibo_image_spider/utils.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:23
3 | import json
4 | import os
5 | import random
6 | import sys
7 | import threading
8 | import time
9 | from functools import wraps
10 |
11 | from requests import Session
12 |
13 | thread_local = threading.local()
14 |
15 |
16 | def cookies_from_raw(raw):
17 | return dict([line.split("=")[0], line.split("=")[1]] for line in raw.split("; "))
18 |
19 |
20 | def get_session():
21 | if not hasattr(thread_local, "session"):
22 | thread_local.session = Session()
23 | return thread_local.session
24 |
25 |
26 | def retry(exceptions=Exception, tries=3, delay=1, backoff=2, logger=None):
27 | """
28 | Retry calling the decorated function using an exponential backoff.
29 | Args:
30 | exceptions: The exception to check. may be a tuple of
31 | exceptions to check.
32 | tries: Number of times to try (not retry) before giving up.
33 | delay: Initial delay between retries in seconds.
34 | backoff: Backoff multiplier (e.g. value of 2 will double the delay
35 | each retry).
36 | logger: Logger to use. If None, print.
37 | """
38 |
39 | def deco_retry(f):
40 | @wraps(f)
41 | def f_retry(*args, **kwargs):
42 | mtries, mdelay = tries, delay or random.uniform(0.5, 1.5)
43 | while mtries > 1:
44 | try:
45 | return f(*args, **kwargs)
46 | except exceptions as e:
47 | if logger:
48 | logger.error("{}, Retrying in {} seconds...".format(e, mdelay))
49 | else:
50 | print("\n{}, Retrying in {} seconds...".format(e, mdelay))
51 | time.sleep(mdelay)
52 | mtries -= 1
53 | mdelay *= backoff
54 | return f(*args, **kwargs)
55 |
56 | return f_retry
57 |
58 | return deco_retry
59 |
60 |
61 | def mkdirs_if_not_exist(dir):
62 | if not os.path.isdir(dir):
63 | try:
64 | os.makedirs(dir)
65 | except FileExistsError:
66 | pass
67 |
68 |
69 | def convert_to_safe_filename(filename):
70 | return "".join([c for c in filename if c not in r'\/:*?"<>|']).strip()
71 |
72 |
73 | def read_cookie():
74 | with open("cookie", "r") as f:
75 | return f.read().strip()
76 |
77 |
78 | def save_cookie(cookie):
79 | with open("cookie", "w") as f:
80 | f.write(cookie)
81 |
82 |
83 | def quit(msg, code=0):
84 | print(msg)
85 | sys.exit(code)
86 |
87 |
88 | def save_records(c):
89 | filename = os.path.join(c.saved_dir, c.status.start_time.strftime("%Y-%m-%d_%H-%M-%S") + ".json")
90 | with open(filename, "w") as f:
91 | f.write(
92 | json.dumps(
93 | {
94 | "nickname": c.user.name,
95 | "uid": c.user.uid,
96 | "datetime": c.status.start_time_repr,
97 | "succeed": {"count": len(c.status.succeed), "urls": c.status.succeed},
98 | "failed": {"count": len(c.status.failed), "urls": c.status.failed},
99 | },
100 | ensure_ascii=False,
101 | indent=2,
102 | )
103 | )
104 |
--------------------------------------------------------------------------------
/weibo_image_spider/cli.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:46
3 | import json
4 | import logging
5 | from concurrent.futures import ThreadPoolExecutor, wait
6 |
7 | import click
8 | from pydantic import ValidationError
9 | from requests.exceptions import ConnectionError, RequestException
10 | from termcolor import colored
11 |
12 | from weibo_image_spider.constants import Constant
13 | from weibo_image_spider.models import Parameters, PhotoAPI
14 | from weibo_image_spider.spider_workers import crawl_worker, download_worker, query_user_by_name
15 | from weibo_image_spider.utils import mkdirs_if_not_exist, quit, save_records
16 |
17 |
18 | @click.command(help="A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.")
19 | @click.option("-u", "--nickname", "nickname", help="Nickname")
20 | @click.option(
21 | "-d", "--destination", "destination", default="weibo_images/", show_default=True, help="Directory to save images"
22 | )
23 | @click.option(
24 | "-o", "--overwrite", "overwrite", is_flag=True, default=False, show_default=True, help="Overwrite existing files"
25 | )
26 | @click.option(
27 | "-t",
28 | "--thumbnail",
29 | "thumbnail",
30 | is_flag=True,
31 | default=False,
32 | show_default=True,
33 | help="Download thumbnails with a maximum width of 690px",
34 | )
35 | @click.option(
36 | "-n",
37 | "--max-images",
38 | "max_images",
39 | default=2000,
40 | show_default=True,
41 | type=int,
42 | help="Maximum number of images to download",
43 | )
44 | @click.option(
45 | "-w", "--max-workers", "max_workers", default=15, show_default=True, type=int, help="Maximum thread workers"
46 | )
47 | @click.option(
48 | "-P",
49 | "--proxies",
50 | "proxies_raw",
51 | help="Use proxies to access websites.\nExample:\n'"
52 | '{"http": "user:password@example.com:port",\n'
53 | '"https": "user:password@example.com:port"}\'',
54 | )
55 | @click.option(
56 | "-v",
57 | "--verbose",
58 | "verbose",
59 | is_flag=True,
60 | help="Show more information for debugging",
61 | )
62 | def weibo_command(**kwargs):
63 | try:
64 | paras = Parameters(**kwargs)
65 | const = Constant(**paras.dict())
66 | except ValidationError as e:
67 | quit("Invalid arguments: " + ", ".join([f'{a["loc"][0]} - {a["msg"]}' for a in json.loads(e.json())]), 1)
68 |
69 | logging.basicConfig(
70 | level=logging.INFO if const.verbose else logging.ERROR,
71 | format="[%(asctime)s %(threadName)-23s %(levelname)-5s %(lineno)3d] %(message)s",
72 | )
73 |
74 | try:
75 | const.user = query_user_by_name(const)
76 | except (ConnectionError, RequestException) as e:
77 | quit(f"Network error: {e}", 1)
78 |
79 | mkdirs_if_not_exist(const.saved_dir)
80 | print(
81 | f"\n - - - - - -+-+ {const.status.start_time_repr} +-+- - - - - -\n"
82 | f' Nickname: {colored(const.user.name, "cyan")}\n'
83 | f' User ID: {colored(const.user.uid, "cyan")}\n'
84 | f'Destination: {colored(const.saved_dir, attrs=["underline"])}\n'
85 | f" Overwrite: {const.overwrite}\n"
86 | f" Thumbnail: {const.thumbnail}\n"
87 | f" Max images: {const.max_images}\n"
88 | )
89 |
90 | const.photo_api = PhotoAPI(
91 | action_data=f"type=photo&owner_uid={const.user.uid}&viewer_uid={const.user.uid}" f"&since_id=-1",
92 | page_id=int(f"100505{const.user.uid}"),
93 | page=1,
94 | )
95 |
96 | with ThreadPoolExecutor(max_workers=const.max_workers + 1) as pool:
97 | img_crawler = pool.submit(crawl_worker, const)
98 | img_downloader = [pool.submit(download_worker, const) for _ in range(const.max_workers)]
99 | wait([img_crawler] + img_downloader)
100 |
101 | save_records(const)
102 | quit("\n\nDownload completed, bye bye ~")
103 |
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "hash": {
4 | "sha256": "29cab4f0b5fdb1c0d9a58d4d31abc8c8a71a3cd23750e9c65bd93b0ce19f0728"
5 | },
6 | "pipfile-spec": 6,
7 | "requires": {
8 | "python_version": "3.6"
9 | },
10 | "sources": [
11 | {
12 | "name": "pypi",
13 | "url": "https://pypi.org/simple",
14 | "verify_ssl": true
15 | }
16 | ]
17 | },
18 | "default": {
19 | "beautifulsoup4": {
20 | "hashes": [
21 | "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a",
22 | "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887",
23 | "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"
24 | ],
25 | "index": "pypi",
26 | "version": "==4.8.2"
27 | },
28 | "certifi": {
29 | "hashes": [
30 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
31 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
32 | ],
33 | "version": "==2019.11.28"
34 | },
35 | "chardet": {
36 | "hashes": [
37 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
38 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
39 | ],
40 | "version": "==3.0.4"
41 | },
42 | "click": {
43 | "hashes": [
44 | "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc",
45 | "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"
46 | ],
47 | "index": "pypi",
48 | "version": "==7.1.1"
49 | },
50 | "dataclasses": {
51 | "hashes": [
52 | "sha256:3459118f7ede7c8bea0fe795bff7c6c2ce287d01dd226202f7c9ebc0610a7836",
53 | "sha256:494a6dcae3b8bcf80848eea2ef64c0cc5cd307ffc263e17cdf42f3e5420808e6"
54 | ],
55 | "markers": "python_version < '3.7'",
56 | "version": "==0.7"
57 | },
58 | "idna": {
59 | "hashes": [
60 | "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
61 | "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
62 | ],
63 | "version": "==2.9"
64 | },
65 | "pydantic": {
66 | "hashes": [
67 | "sha256:012c422859bac2e03ab3151ea6624fecf0e249486be7eb8c6ee69c91740c6752",
68 | "sha256:07911aab70f3bc52bb845ce1748569c5e70478ac977e106a150dd9d0465ebf04",
69 | "sha256:47b8db7024ba3d46c3d4768535e1cf87b6c8cf92ccd81e76f4e1cb8ee47688b3",
70 | "sha256:50e4e948892a6815649ad5a9a9379ad1e5f090f17842ac206535dfaed75c6f2f",
71 | "sha256:51f11c8bbf794a68086540da099aae4a9107447c7a9d63151edbb7d50110cf21",
72 | "sha256:6100d7862371115c40be55cc4b8d766a74b1d0dbaf99dbfe72bb4bac0faf89ed",
73 | "sha256:61d22d36808087d3184ed6ac0d91dd71c533b66addb02e4a9930e1e30833202f",
74 | "sha256:72184c1421103cca128300120f8f1185fb42a9ea73a1c9845b1c53db8c026a7d",
75 | "sha256:831a0265a9e3933b3d0f04d1a81bba543bafbe4119c183ff2771871db70524ab",
76 | "sha256:8848b4eb458469739126e4c1a202d723dd092e087f8dbe3104371335f87ba5df",
77 | "sha256:bbbed364376f4a0aebb9ea452ff7968b306499a9e74f4db69b28ff2cd4043a11",
78 | "sha256:e27559cedbd7f59d2375bfd6eea29a330ea1a5b0589c34d6b4e0d7bec6027bbf",
79 | "sha256:f17ec336e64d4583311249fb179528e9a2c27c8a2eaf590ec6ec2c6dece7cb3f",
80 | "sha256:f863456d3d4bf817f2e5248553dee3974c5dc796f48e6ddb599383570f4215ac"
81 | ],
82 | "index": "pypi",
83 | "version": "==1.4"
84 | },
85 | "requests": {
86 | "hashes": [
87 | "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
88 | "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
89 | ],
90 | "index": "pypi",
91 | "version": "==2.23.0"
92 | },
93 | "soupsieve": {
94 | "hashes": [
95 | "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
96 | "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
97 | ],
98 | "version": "==2.0"
99 | },
100 | "termcolor": {
101 | "hashes": [
102 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
103 | ],
104 | "index": "pypi",
105 | "version": "==1.1.0"
106 | },
107 | "urllib3": {
108 | "hashes": [
109 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc",
110 | "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"
111 | ],
112 | "version": "==1.25.8"
113 | }
114 | },
115 | "develop": {}
116 | }
117 |
--------------------------------------------------------------------------------
/weibo_image_spider/spider_workers.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | import logging
4 | import os
5 | import queue
6 | import threading
7 |
8 | from bs4 import BeautifulSoup
9 | from requests import Session
10 | from requests.exceptions import ConnectionError, RequestException
11 | from termcolor import colored
12 |
13 | from .constants import Constant
14 | from .exceptions import ContentParserError, CookiesExpiredException, NoImagesException, UserNotFound
15 | from .models import User, appointment_jobs, downloading_jobs
16 | from .utils import get_session, retry, save_cookie
17 |
18 | lock = threading.RLock()
19 |
20 |
21 | @retry(logger=logging)
22 | def query_user_by_name(const: Constant):
23 | session = get_session()
24 |
25 | try:
26 | logging.info(f"Getting information of username: {const.nickname}...")
27 | resp = session.get(const.user_search_api, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
28 | resp.raise_for_status()
29 | except Exception as e:
30 | logging.info(f"Getting user information error: {e}")
31 | raise ConnectionError(e)
32 |
33 | try:
34 |         logging.info("Parsing user information from the search API response...")
35 | first = resp.json()["user"][0]
36 | name = first["u_name"]
37 | uid = first["u_id"]
38 | except (KeyError, IndexError) as e:
39 | logging.info(f"Parsing user information error: {e}")
40 | raise ContentParserError(
41 |             "Weibo API updated, please open an issue at https://github.com/lonsty/weibo-image-spider/issues."
42 | )
43 | user = User(name=name, host=f"https://weibo.com/u/{uid}", uid=uid)
44 | logging.info(f"Got information of username: {const.nickname}, {user}")
45 |
46 | return user
47 |
48 |
49 | @retry((RequestException, CookiesExpiredException), logger=logging)
50 | def crawl_image(const: Constant, url: str, session: Session):
51 | try:
52 | logging.info(f"Getting urls from page...")
53 | resp = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
54 | resp.raise_for_status()
55 | except Exception as e:
56 | logging.info(f"Getting urls from page error: {e}")
57 | raise RequestException(e)
58 |
59 | try:
60 | logging.info(f"Parsing urls from page...")
61 | soup = BeautifulSoup(resp.json().get("data"), "html.parser")
62 | boxes = soup.find_all("a", class_="ph_ar_box")
63 | for box in boxes:
64 | img = const.rex_pattern.search(box.find("img").get("src")).group(0)
65 | downloading_jobs.put(img)
66 | logging.info(f"Parsed {len(boxes)} urls from page")
67 | except Exception as e:
68 | logging.info(f"Parsing urls from page error: {e}")
69 |         raise CookiesExpiredException("Cookie has expired, please get a new one and paste it here:\n")
70 |
71 | logging.info(f"Parsing action-data from page...")
72 | card = soup.find("div", class_="WB_cardwrap")
73 | if not card:
74 | logging.info(f"No action-data in page")
75 | raise NoImagesException("No more images to crawl")
76 |
77 | action_data = card.get("action-data")
78 | const.photo_api.action_data = action_data
79 | logging.info(f"Got action-data from page: {action_data}")
80 |
81 |
82 | def crawl_worker(const: Constant):
83 | page = 1
84 | session = get_session()
85 |
86 | while appointment_jobs.qsize() < const.max_images:
87 | const.photo_api.page = page
88 | try:
89 | logging.info(f"Crawling page {page}...")
90 | crawl_image(const, const.user_photo_api, session)
91 | logging.info(f"Crawled page {page}")
92 | except CookiesExpiredException as e:
93 |             logging.info("Cookies have expired, requesting a new cookie")
94 | const.cookies_raw = input(str(e))
95 | save_cookie(const.cookies_raw)
96 | logging.info(f"Saved new cookies")
97 | continue
98 | except (NoImagesException, Exception) as e:
99 | logging.info(f"Crawling page: {e}")
100 | break
101 | page += 1
102 | const.end_crawler = True
103 |
104 |
105 | @retry(logger=logging)
106 | def download_image(const: Constant, img: str, session: Session):
107 | url = const.img_url_prefix + img
108 | filename = os.path.join(const.saved_dir, img)
109 |
110 | if (not const.overwrite) and os.path.isfile(filename):
111 | logging.info(f"Skipped downloaded image: {filename}")
112 | return url
113 |
114 | try:
115 |         logging.info("Requesting image headers...")
116 | head = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
117 | head.raise_for_status()
118 | image_size = int(head.headers["Content-Length"].strip())
119 | logging.info(f"Got image: {url} size: {image_size}")
120 | except Exception as e:
121 |         logging.info(f"Requesting image headers error: {e}")
122 | raise RequestException(e)
123 |
124 | try:
125 | logging.info(f"Downloading image...")
126 | resp = session.get(url, cookies=const.cookies, proxies=const.proxies, stream=True, timeout=const.timeout)
127 | resp.raise_for_status()
128 | logging.info(f"Downloaded image")
129 | except Exception as e:
130 | logging.info(f"Downloading image error: {e}")
131 | raise RequestException(e)
132 |
133 | write_size = 0
134 | with open(filename, "wb") as f:
135 | for chunk in resp.iter_content(chunk_size=8192):
136 | f.write(chunk)
137 | write_size += len(chunk)
138 |
139 | if write_size < image_size:
140 | os.remove(filename)
141 | logging.info(f"Saving image error: image is incomplete")
142 | raise RequestException("The downloaded image is incomplete")
143 | logging.info(f"Saved image: {filename}")
144 |
145 | return url
146 |
147 |
148 | def download_worker(const: Constant):
149 | session = get_session()
150 |
151 | while appointment_jobs.qsize() < const.max_images:
152 | try:
153 | img = downloading_jobs.get_nowait()
154 | with lock:
155 | if appointment_jobs.qsize() < const.max_images:
156 | appointment_jobs.put(img)
157 | else:
158 | break
159 | logging.info(f"Download worker start...")
160 | result = download_image(const, img, session)
161 | except queue.Empty:
162 | if const.cancel or const.end_crawler:
163 | break
164 | except Exception as e:
165 | logging.info(f"Download worker error: {e}")
166 | result = const.img_url_prefix + img
167 | const.status.failed.append(result)
168 | print(
169 | f'{colored("[x]", "red", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
170 | f"{const.status.fmt_status}",
171 | end="\r" if not const.verbose else "\n",
172 | flush=True,
173 | )
174 | else:
175 | logging.info(f"Download worker succeed")
176 | const.status.succeed.append(result)
177 | print(
178 | f'{colored("[√]", "green", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
179 | f"{const.status.fmt_status}",
180 | end="\r" if not const.verbose else "\n",
181 | flush=True,
182 | )
183 |
--------------------------------------------------------------------------------