├── .gitignore
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── README.md
├── docs
│   ├── README_CN.md
│   ├── get_cookie.md
│   ├── get_cookie.png
│   ├── screenshot_1.png
│   └── screenshot_2.png
├── main.py
├── requirements.txt
└── weibo_image_spider
    ├── __init__.py
    ├── cli.py
    ├── constants.py
    ├── exceptions.py
    ├── models.py
    ├── spider_workers.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea
107 | weibo_images/
108 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Allen Shaw
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | beautifulsoup4 = "==4.8.2" 10 | click = "==7.1.1" 11 | termcolor = "==1.1.0" 12 | requests = "==2.23.0" 13 | pydantic = "==1.4" 14 | 15 | [requires] 16 | python_version = "3.6" 17 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "29cab4f0b5fdb1c0d9a58d4d31abc8c8a71a3cd23750e9c65bd93b0ce19f0728" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "beautifulsoup4": { 20 | "hashes": [ 21 | "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a", 22 | "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887", 23 | "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae" 24 | ], 25 | "index": "pypi", 26 | "version": "==4.8.2" 27 | }, 28 | "certifi": { 29 | "hashes": [ 30 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", 31 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" 32 | ], 33 | "version": "==2019.11.28" 34 | }, 35 | "chardet": { 36 | "hashes": [ 37 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 38 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 39 | ], 40 | "version": "==3.0.4" 41 | }, 42 | "click": { 43 | "hashes": [ 44 | "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc", 45 | "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a" 46 | ], 47 | "index": "pypi", 48 | "version": "==7.1.1" 49 | }, 50 | "dataclasses": { 51 | "hashes": [ 52 | "sha256:3459118f7ede7c8bea0fe795bff7c6c2ce287d01dd226202f7c9ebc0610a7836", 53 | "sha256:494a6dcae3b8bcf80848eea2ef64c0cc5cd307ffc263e17cdf42f3e5420808e6" 54 | ], 55 | "markers": "python_version < '3.7'", 56 | "version": "==0.7" 57 | }, 58 | "idna": { 59 | "hashes": [ 60 | "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", 61 | "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" 62 | ], 63 | "version": "==2.9" 64 | }, 65 | "pydantic": { 66 | "hashes": [ 67 | "sha256:012c422859bac2e03ab3151ea6624fecf0e249486be7eb8c6ee69c91740c6752", 68 | "sha256:07911aab70f3bc52bb845ce1748569c5e70478ac977e106a150dd9d0465ebf04", 69 | "sha256:47b8db7024ba3d46c3d4768535e1cf87b6c8cf92ccd81e76f4e1cb8ee47688b3", 70 | "sha256:50e4e948892a6815649ad5a9a9379ad1e5f090f17842ac206535dfaed75c6f2f", 71 | "sha256:51f11c8bbf794a68086540da099aae4a9107447c7a9d63151edbb7d50110cf21", 72 | "sha256:6100d7862371115c40be55cc4b8d766a74b1d0dbaf99dbfe72bb4bac0faf89ed", 73 | "sha256:61d22d36808087d3184ed6ac0d91dd71c533b66addb02e4a9930e1e30833202f", 74 | "sha256:72184c1421103cca128300120f8f1185fb42a9ea73a1c9845b1c53db8c026a7d", 75 | "sha256:831a0265a9e3933b3d0f04d1a81bba543bafbe4119c183ff2771871db70524ab", 76 | "sha256:8848b4eb458469739126e4c1a202d723dd092e087f8dbe3104371335f87ba5df", 77 | "sha256:bbbed364376f4a0aebb9ea452ff7968b306499a9e74f4db69b28ff2cd4043a11", 78 | 
"sha256:e27559cedbd7f59d2375bfd6eea29a330ea1a5b0589c34d6b4e0d7bec6027bbf", 79 | "sha256:f17ec336e64d4583311249fb179528e9a2c27c8a2eaf590ec6ec2c6dece7cb3f", 80 | "sha256:f863456d3d4bf817f2e5248553dee3974c5dc796f48e6ddb599383570f4215ac" 81 | ], 82 | "index": "pypi", 83 | "version": "==1.4" 84 | }, 85 | "requests": { 86 | "hashes": [ 87 | "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", 88 | "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" 89 | ], 90 | "index": "pypi", 91 | "version": "==2.23.0" 92 | }, 93 | "soupsieve": { 94 | "hashes": [ 95 | "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae", 96 | "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69" 97 | ], 98 | "version": "==2.0" 99 | }, 100 | "termcolor": { 101 | "hashes": [ 102 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" 103 | ], 104 | "index": "pypi", 105 | "version": "==1.1.0" 106 | }, 107 | "urllib3": { 108 | "hashes": [ 109 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", 110 | "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" 111 | ], 112 | "version": "==1.25.8" 113 | } 114 | }, 115 | "develop": {} 116 | } 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weibo Image Spider 2 | 3 | 微博图片爬虫,极速下载、高清原图、多种命令、简单实用。 4 | 5 | ### 特点: 6 | 7 | - [x] 极速下载:多线程异步下载,可以根据需要设置线程数 8 | - [x] 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/! 9 | - [x] 增量下载:用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯! 10 | - [x] 高清原图:默认下载高清原图,可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px) 11 | 12 | ### 环境: 13 | 14 | - `python3.6` 及以上 15 | 16 | # 快速使用 17 | 18 | ## 1. 克隆项目到本地 19 | 20 | ```sh 21 | $ git clone https://github.com/lonsty/weibo-image-spider.git 22 | ``` 23 | 24 | ## 2. 安装依赖包 25 | 26 | ```sh 27 | $ cd weibo-image-spider 28 | $ pip install -r requirements.txt 29 | ``` 30 | 31 | ## 3. 快速使用 32 | 33 | **注意**: 34 | 35 | *因网页版微博限制,使用爬虫请求其 API 时,需要 cookie 认证,关于 [如何获取 cookie](docs/get_cookie.md)? 36 | 且 cookie 有效期为一天(第二天零点失效),所以最好不要跨天爬取。* 37 | 38 | 下载用户昵称或用户ID 为 `nickname`(或 `user-id`) 的最新 2000(可使用 `-n` 修改) 张图片到路径 `dest` 下: 39 | 40 | ```sh 41 | $ python main.py -u -d 42 | ``` 43 | 44 | 运行截图 45 | 46 | ![screenshot_1.png](docs/screenshot_1.png) 47 | 48 | 爬取结果 49 | 50 | ![screenshot_2.png](docs/screenshot_2.png) 51 | 52 | # 使用帮助 53 | 54 | ### 常用命令 55 | 56 | - 部分图片 **下载失败** 或 **微博有更新**,再执行相同的命令,对失败或新增的图片进行下载 57 | 58 | ```sh 59 | $ python main.py -u -d 60 | ``` 61 | 62 | ### 查看所有命令 63 | 64 | ``` 65 | $ python main.py --help 66 | 67 | Usage: main.py [OPTIONS] 68 | 69 | A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider. 70 | 71 | Options: 72 | -u, --nickname, --user-id TEXT Nickname or User ID 73 | -d, --destination TEXT Directory to save images [default: 74 | weibo_images/] 75 | 76 | -o, --overwrite Overwrite existing files [default: False] 77 | -t, --thumbnail Download thumbnails with a maximum width of 78 | 690px [default: False] 79 | 80 | -n, --max-images INTEGER Maximum number of images to download 81 | [default: 2000] 82 | 83 | -w, --max-workers INTEGER Maximum thread workers [default: 15] 84 | -P, --proxies TEXT Use proxies to access websites. Example: 85 | '{"http": 86 | "user:passwd@www.example.com:port", "https": 87 | "user:passwd@www.example.com:port"}' 88 | 89 | --help Show this message and exit. 
90 |
91 | ```
92 |
93 | # 更新历史
94 |
95 | - ## Version 0.1.2 (2021-11-13)
96 |
97 |   - 修复查询用户信息失败，导致无法继续下载的问题
98 |
99 | - ## Version 0.1.1 (2021-08-26)
100 |
101 |   新功能：
102 |
103 |   - 支持使用用户 ID(User ID)下载：`python main.py -u `
104 |
105 | - ## Version 0.1.0 (2021-05-16)
106 |
107 |   - 调整代码结构
108 |   - 修复偶发的图片下载不完整的问题
109 |   - 修复下载总量与给定量不一致的问题
110 |
111 | - ## Version 0.1.0a (2020-03-29)
112 |
113 |   主要功能：
114 |
115 |   - 极速下载：多线程异步下载，可以根据需要设置线程数
116 |   - 异常重试：只要重试次数足够多，就没有下载不下来的图片 \(^o^)/!
117 |   - 增量下载：用户有新的上传，再跑一遍程序就行了 O(∩_∩)O 嗯!
118 |   - 高清原图：默认下载高清原图，可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
119 |
120 | # LICENSE
121 |
122 | 此项目使用 [MIT](LICENSE) 开源协议
123 |
124 | **注意**：使用此工具下载的所有内容，版权归原作者所有，请谨慎使用！
125 |
--------------------------------------------------------------------------------
/docs/README_CN.md:
--------------------------------------------------------------------------------
1 | # Weibo Image Spider
2 |
3 | 微博图片爬虫，极速下载、高清原图、多种命令、简单实用。
4 |
5 | ### 特点：
6 |
7 | - [x] 极速下载：多线程异步下载，可以根据需要设置线程数
8 | - [x] 异常重试：只要重试次数足够多，就没有下载不下来的图片 \(^o^)/!
9 | - [x] 增量下载：用户有新的上传，再跑一遍程序就行了 O(∩_∩)O 嗯!
10 | - [x] 高清原图：默认下载高清原图，可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
11 |
12 | ### 环境：
13 |
14 | - `python3.6` 及以上
15 |
16 | # 快速使用
17 |
18 | ## 1. 克隆项目到本地
19 |
20 | ```sh
21 | $ git clone https://github.com/lonsty/weibo-image-spider.git
22 | ```
23 |
24 | ## 2. 安装依赖包
25 |
26 | ```sh
27 | $ cd weibo-image-spider
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | ## 3. 快速使用
32 |
33 | **注意**：
34 |
35 | *因网页版微博限制，使用爬虫请求其 API 时，需要 cookie 认证，关于 [如何获取 cookie](get_cookie.md)?
36 | 且 cookie 有效期为一天(第二天零点失效)，所以最好不要跨天爬取。*
37 |
38 | 下载用户昵称为 `nickname` 的最新 2000(可使用 `-n` 修改) 张图片到路径 `dest` 下：
39 |
40 | ```sh
41 | $ python main.py -u -d
42 | ```
43 |
44 | 运行截图
45 |
46 | ![screenshot_1.png](docs/screenshot_1.png)
47 |
48 | 爬取结果
49 |
50 | ![screenshot_2.png](docs/screenshot_2.png)
51 |
52 | # 使用帮助
53 |
54 | ### 常用命令
55 |
56 | - 部分图片 **下载失败** 或 **微博有更新**，再执行相同的命令，对失败或新增的图片进行下载
57 |
58 | ```sh
59 | $ python main.py -u -d
60 | ```
61 |
62 | ### 查看所有命令
63 |
64 | ```
65 | $ python main.py --help
66 |
67 | Usage: main.py [OPTIONS]
68 |
69 |   A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.
70 |
71 | Options:
72 |   -u, --nickname TEXT        Nickname
73 |   -d, --destination TEXT     Directory to save images  [default:
74 |                              weibo_images/]
75 |
76 |   -o, --overwrite            Overwrite existing files  [default: False]
77 |   -t, --thumbnail            Download thumbnails with a maximum width of 690px
78 |                              [default: False]
79 |
80 |   -n, --max-images INTEGER   Maximum number of images to download  [default:
81 |                              2000]
82 |
83 |   -w, --max-workers INTEGER  Maximum thread workers  [default: 15]
84 |   -P, --proxies TEXT         Use proxies to access websites. Example:
85 |                              '{"http": "user:passwd@www.example.com:port",
86 |                              "https": "user:passwd@www.example.com:port"}'
87 |
88 |   --help                     Show this message and exit.
89 | ```
90 |
91 | # 更新历史
92 |
93 | - ## Version 0.1.0a (2020-03-29)
94 |
95 |   主要功能：
96 |
97 |   - 极速下载：多线程异步下载，可以根据需要设置线程数
98 |   - 异常重试：只要重试次数足够多，就没有下载不下来的图片 \(^o^)/!
99 |   - 增量下载：用户有新的上传，再跑一遍程序就行了 O(∩_∩)O 嗯!
100 |   - 高清原图：默认下载高清原图，可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
101 |
102 | # LICENSE
103 |
104 | 此项目使用 [MIT](LICENSE) 开源协议
105 |
106 | **注意**：使用此工具下载的所有内容，版权归原作者所有，请谨慎使用！
107 |
--------------------------------------------------------------------------------
/docs/get_cookie.md:
--------------------------------------------------------------------------------
1 | ## 获取网页版微博的 cookie：
2 |
3 | 1. 前往微博主页 [https://www.weibo.com/](https://www.weibo.com/)，并使用个人账号完成登录；
4 |
5 | 2. 以 Google Chrome 浏览器为例，按 F12 打开开发者模式，依次点击「Network」→「XHR」，然后按 F5 刷新，在 XHR 记录中随便选中一条，点开 Headers，复制 Request Headers 中的 Cookie 值，将其粘贴在根目录的 [cookie](../cookie) 文件中，覆盖原内容即可。
6 |
7 | ![get_cookie](get_cookie.png)
8 |
--------------------------------------------------------------------------------
/docs/get_cookie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/c7dae38b51209296cc8e71aa6fb80f094d549198/docs/get_cookie.png
--------------------------------------------------------------------------------
/docs/screenshot_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/c7dae38b51209296cc8e71aa6fb80f094d549198/docs/screenshot_1.png
--------------------------------------------------------------------------------
/docs/screenshot_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/c7dae38b51209296cc8e71aa6fb80f094d549198/docs/screenshot_2.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | from weibo_image_spider.cli import weibo_command
4 |
5 | if __name__ == "__main__":
6 |     weibo_command()
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.8.2
2 | click==7.1.1
3 | termcolor==1.1.0
4 | requests==2.23.0
5 | pydantic==1.4
6 |
--------------------------------------------------------------------------------
/weibo_image_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:22
3 | from .spider_workers import crawl_worker, download_worker, query_user_by_name
4 |
5 | __author__ = "Allen Shaw"
6 | __version__ = "0.1.0"
7 |
8 | __all__ = []
9 |
--------------------------------------------------------------------------------
/weibo_image_spider/cli.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:46
3 | import json
4 | import logging
5 | from concurrent.futures import ThreadPoolExecutor, wait
6 |
7 | import click
8 | from pydantic import ValidationError
9 | from requests.exceptions import ConnectionError, RequestException
10 | from termcolor import colored
11 |
12 | from weibo_image_spider.constants import Constant
13 | from weibo_image_spider.models import Parameters, PhotoAPI
14 | from weibo_image_spider.spider_workers import crawl_worker, download_worker, query_user_by_name
15 | from weibo_image_spider.utils import mkdirs_if_not_exist, quit, save_records
16 |
17 |
18 | @click.command(help="A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.")
19 | @click.option("-u", "--nickname", "nickname", help="Nickname")
20 | @click.option(
21 |     "-d", "--destination", "destination", default="weibo_images/", show_default=True, help="Directory to save images"
22 | )
23 | @click.option(
24 |     "-o", "--overwrite", "overwrite", is_flag=True, default=False, show_default=True, help="Overwrite existing files"
25 | )
26 | @click.option(
27 |     "-t",
28 |     "--thumbnail",
29 |     "thumbnail",
30 |     is_flag=True,
31 |     default=False,
32 |     show_default=True,
33 |     help="Download thumbnails with a maximum width of 690px",
34 | )
35 | @click.option(
36 |     "-n",
37 |     "--max-images",
38 |     "max_images",
39 |     default=2000,
40 |     show_default=True,
41 |     type=int,
42 |     help="Maximum number of images to download",
43 | )
44 | @click.option(
45 |     "-w", "--max-workers", "max_workers", default=15, show_default=True, type=int, help="Maximum thread workers"
46 | )
47 | @click.option(
48 |     "-P",
49 |     "--proxies",
50 |     "proxies_raw",
51 |     help="Use proxies to access websites.\nExample:\n'"
52 |     '{"http": "user:password@example.com:port",\n'
53 |     '"https": "user:password@example.com:port"}\'',
54 | )
55 | @click.option(
56 |     "-v",
57 |     "--verbose",
58 |     "verbose",
59 |     is_flag=True,
60 |     help="Show more information for debugging",
61 | )
62 | def weibo_command(**kwargs):
63 |     try:
64 |         paras = Parameters(**kwargs)
65 |         const = Constant(**paras.dict())
66 |     except ValidationError as e:
67 |         quit("Invalid arguments: " + ", ".join([f'{a["loc"][0]} - {a["msg"]}' for a in json.loads(e.json())]), 1)
68 |
69 |     logging.basicConfig(
70 |         level=logging.INFO if const.verbose else logging.ERROR,
71 |         format="[%(asctime)s %(threadName)-23s %(levelname)-5s %(lineno)3d] %(message)s",
72 |     )
73 |
74 |     try:
75 |         const.user = query_user_by_name(const)
76 |     except (ConnectionError, RequestException) as e:
77 |         quit(f"Network error: {e}", 1)
78 |
79 |     mkdirs_if_not_exist(const.saved_dir)
80 |     print(
81 |         f"\n - - - - - -+-+ {const.status.start_time_repr} +-+- - - - - -\n"
82 |         f'   Nickname: {colored(const.user.name, "cyan")}\n'
83 |         f'    User ID: {colored(const.user.uid, "cyan")}\n'
84 |         f'Destination: {colored(const.saved_dir, attrs=["underline"])}\n'
85 |         f"  Overwrite: {const.overwrite}\n"
86 |         f"  Thumbnail: {const.thumbnail}\n"
87 |         f" Max images: {const.max_images}\n"
88 |     )
89 |
90 |     const.photo_api = PhotoAPI(
91 |         action_data=f"type=photo&owner_uid={const.user.uid}&viewer_uid={const.user.uid}" f"&since_id=-1",
92 |         page_id=int(f"100505{const.user.uid}"),
93 |         page=1,
94 |     )
95 |
96 |     with ThreadPoolExecutor(max_workers=const.max_workers + 1) as pool:
97 |         img_crawler = pool.submit(crawl_worker, const)
98 |         img_downloader = [pool.submit(download_worker, const) for _ in range(const.max_workers)]
99 |         wait([img_crawler] + img_downloader)
100 |
101 |     save_records(const)
102 |     quit("\n\nDownload completed, bye bye ~")
103 |
--------------------------------------------------------------------------------
/weibo_image_spider/constants.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:27
3 | import json
4 | import logging
5 | import os
6 | import random
7 | import re
8 | import time
9 | # from random import choice, random
10 | from typing import List
11 |
12 | from pydantic import BaseModel
13 | from weibo_image_spider.models import PhotoAPI, Status, User
14 | from weibo_image_spider.utils import convert_to_safe_filename, read_cookie
15 |
16 |
17 | class Constant(BaseModel):
18 |     search_url: str = "https://s.weibo.com/user?q={user}&Refer=weibo_user"
19 |     search_api: str = "https://s.weibo.com/ajax/topsuggest.php?key={user}&_k={ts}&_t=1&outjson=1&uid={uid}"
20 |     img_hosts: List[str] = ["https://wx1.sinaimg.cn", "https://wx2.sinaimg.cn", "https://wx3.sinaimg.cn"]
21 |     cookies_raw: str = ""
22 |     user: User = User()
23 |     photo_api: PhotoAPI = PhotoAPI()
24 |     status: Status = Status()
25 |     nickname: str = "lonsty"
26 |     destination: str = "weibo_images"
27 |     overwrite: bool = False
28 |     thumbnail: bool = False
29 |     max_images: int = 2000
30 |     max_workers: int = 15
31 |     proxies_raw: str = None
32 |     timeout: int = 10
33 |     cancel: bool = False
34 |     end_crawler: bool = False
35 |     verbose: bool = False
36 |
37 |     def __init__(self, **kargs):
38 |         super(Constant, self).__init__(**kargs)
39 |         self.cookies_raw = read_cookie()
40 |
41 |     @property
42 |     def cookies(self):
43 |         try:
44 |             return dict([item.split("=")[0], item.split("=")[1]] for item in self.cookies_raw.split("; "))
45 |         except Exception as e:
46 |             logging.warning(e)
47 |             return None
48 |
49 |     @property
50 |     def img_url_prefix(self):
51 |         return f'{random.choice(self.img_hosts)}/{"large" if not self.thumbnail else "mw690"}/'
52 |
53 |     @property
54 |     def saved_dir(self):
55 |         return os.path.join(os.path.abspath(self.destination), convert_to_safe_filename(self.user.name))
56 |
57 |     @property
58 |     def rex_pattern(self):
59 |         return re.compile("(?<=/)\w*?\.(?:jpg|gif)", re.IGNORECASE)
60 |
61 |     @property
62 |     def user_photo_api(self):
63 |         return self.photo_api.api
64 |
65 |     @property
66 |     def user_search_api(self):
67 |         return self.search_api.format(
68 |             user=self.nickname, ts=int(time.time() * 1000), uid=random.randrange(1_000_000_000, 9_999_999_999)
69 |         )
70 |
71 |     @property
72 |     def proxies(self):
73 |         if isinstance(self.proxies_raw, str):
74 |             try:
75 |                 return json.loads(self.proxies_raw)
76 |             except Exception as e:
77 |                 logging.warning(f"Proxy will not be used: {e}")
78 |                 return None
79 |         return None
80 |
--------------------------------------------------------------------------------
/weibo_image_spider/exceptions.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:01
3 |
4 |
5 | class CookiesExpiredException(Exception):
6 |     pass
7 |
8 |
9 | class NoImagesException(Exception):
10 |     pass
11 |
12 |
13 | class ContentParserError(Exception):
14 |     pass
15 |
16 |
17 | class UserNotFound(Exception):
18 |     pass
19 |
--------------------------------------------------------------------------------
/weibo_image_spider/models.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 15:30
3 | import time
4 | from datetime import datetime
5 | from queue import Queue
6 |
7 | from pydantic import BaseModel
8 | from termcolor import colored
9 |
10 | downloading_jobs = Queue()
11 | appointment_jobs = Queue()
12 |
13 |
14 | class User(BaseModel):
15 |     name = ""
16 |     uid: int = 0
17 |     host: str = ""
18 |
19 |
20 | class PhotoAPI(BaseModel):
21 |     action_data: str = ""
22 |     page_id: int = 0
23 |     page: int = 1
24 |
25 |     @property
26 |     def api(self):
27 |         return (
28 |             f"https://weibo.com/p/aj/album/loading?ajwvr=6&{self.action_data}"
29 |             f"&page_id={self.page_id}&page={self.page}&ajax_call=1&__rnd={self.rnd}"
30 |         )
31 |
32 |     @property
33 |     def rnd(self):
34 |         return int(time.time() * 1000)
35 |
36 |
37 | class Parameters(BaseModel):
38 |     nickname = ""
39 |     uid: int = 0
40 |     destination: str
41 |     overwrite: bool
42 |     thumbnail: bool
43 |     max_images: int
44 |     max_workers: int
45 |     verbose: bool
46 |
47 |
48 | class Status(BaseModel):
49 |     succeed = []
50 |     failed = []
51 |     start_time = datetime.now()
52 |
53 |     @property
54 |     def total_complete(self):
55 |         return len(self.succeed) + len(self.failed)
56 |
57 |     @property
58 |     def start_time_repr(self):
59 |         return self.start_time.ctime()
60 |
61 |     @property
62 |     def time_used(self):
63 |         return str(datetime.now() - self.start_time)[:-7]
64 |
65 |     @property
66 |     def fmt_status(self):
67 |         return (
68 |             f'[Succeed: {colored(str(len(self.succeed)), "green")}, '
69 |             f'Failed: {colored(str(len(self.failed)), "red")}]'
70 |         )
71 |
--------------------------------------------------------------------------------
/weibo_image_spider/spider_workers.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | import logging
4 | import os
5 | import queue
6 | import threading
7 |
8 | from bs4 import BeautifulSoup
9 | from requests import Session
10 | from requests.exceptions import ConnectionError, RequestException
11 | from termcolor import colored
12 |
13 | from .constants import Constant
14 | from .exceptions import ContentParserError, CookiesExpiredException, NoImagesException, UserNotFound
15 | from .models import User, appointment_jobs, downloading_jobs
16 | from .utils import get_session, retry, save_cookie
17 |
18 | lock = threading.RLock()
19 |
20 |
21 | @retry(logger=logging)
22 | def query_user_by_name(const: Constant):
23 |     session = get_session()
24 |
25 |     try:
26 |         logging.info(f"Getting information of username: {const.nickname}...")
27 |         resp = session.get(const.user_search_api, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
28 |         resp.raise_for_status()
29 |     except Exception as e:
30 |         logging.info(f"Getting user information error: {e}")
31 |         raise ConnectionError(e)
32 |
33 |     try:
34 |         logging.info("Parsing user information from response...")
35 |         first = resp.json()["user"][0]
36 |         name = first["u_name"]
37 |         uid = first["u_id"]
38 |     except (KeyError, IndexError) as e:
39 |         logging.info(f"Parsing user information error: {e}")
40 |         raise ContentParserError(
41 |             "Weibo API updated, please add an issue " "to https://github.com/lonsty/weibo-image-spider/issues."
42 |         )
43 |     user = User(name=name, host=f"https://weibo.com/u/{uid}", uid=uid)
44 |     logging.info(f"Got information of username: {const.nickname}, {user}")
45 |
46 |     return user
47 |
48 |
49 | @retry((RequestException, CookiesExpiredException), logger=logging)
50 | def crawl_image(const: Constant, url: str, session: Session):
51 |     try:
52 |         logging.info(f"Getting urls from page...")
53 |         resp = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
54 |         resp.raise_for_status()
55 |     except Exception as e:
56 |         logging.info(f"Getting urls from page error: {e}")
57 |         raise RequestException(e)
58 |
59 |     try:
60 |         logging.info(f"Parsing urls from page...")
61 |         soup = BeautifulSoup(resp.json().get("data"), "html.parser")
62 |         boxes = soup.find_all("a", class_="ph_ar_box")
63 |         for box in boxes:
64 |             img = const.rex_pattern.search(box.find("img").get("src")).group(0)
65 |             downloading_jobs.put(img)
66 |         logging.info(f"Parsed {len(boxes)} urls from page")
67 |     except Exception as e:
68 |         logging.info(f"Parsing urls from page error: {e}")
69 |         raise CookiesExpiredException("Cookie has expired, please get a new one and paste it here:\n")
70 |
71 |     logging.info(f"Parsing action-data from page...")
72 |     card = soup.find("div", class_="WB_cardwrap")
73 |     if not card:
74 |         logging.info(f"No action-data in page")
75 |         raise NoImagesException("No more images to crawl")
76 |
77 |     action_data = card.get("action-data")
78 |     const.photo_api.action_data = action_data
79 |     logging.info(f"Got action-data from page: {action_data}")
80 |
81 |
82 | def crawl_worker(const: Constant):
83 |     page = 1
84 |     session = get_session()
85 |
86 |     while appointment_jobs.qsize() < const.max_images:
87 |         const.photo_api.page = page
88 |         try:
89 |             logging.info(f"Crawling page {page}...")
90 |             crawl_image(const, const.user_photo_api, session)
91 |             logging.info(f"Crawled page {page}")
92 |         except CookiesExpiredException as e:
93 |             logging.info(f"Cookies have expired, need new cookies")
94 |             const.cookies_raw = input(str(e))
95 |             save_cookie(const.cookies_raw)
96 |             logging.info(f"Saved new cookies")
97 |             continue
98 |         except (NoImagesException, Exception) as e:
99 |             logging.info(f"Crawling page: {e}")
100 |             break
101 |         page += 1
102 |     const.end_crawler = True
103 |
104 |
105 | @retry(logger=logging)
106 | def download_image(const: Constant, img: str, session: Session):
107 |     url = const.img_url_prefix + img
108 |     filename = os.path.join(const.saved_dir, img)
109 |
110 |     if (not const.overwrite) and os.path.isfile(filename):
111 |         logging.info(f"Skipped downloaded image: {filename}")
112 |         return url
113 |
114 |     try:
115 |         logging.info(f"Heading image...")
116 |         head = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
117 |         head.raise_for_status()
118 |         image_size = int(head.headers["Content-Length"].strip())
119 |         logging.info(f"Got image: {url} size: {image_size}")
120 |     except Exception as e:
121 |         logging.info(f"Heading image error: {e}")
122 |         raise RequestException(e)
123 |
124 |     try:
125 |         logging.info(f"Downloading image...")
126 |         resp = session.get(url, cookies=const.cookies, proxies=const.proxies, stream=True, timeout=const.timeout)
127 |         resp.raise_for_status()
128 |         logging.info(f"Downloaded image")
129 |     except Exception as e:
130 |         logging.info(f"Downloading image error: {e}")
131 |         raise RequestException(e)
132 |
133 |     write_size = 0
134 |     with open(filename, "wb") as f:
135 |         for chunk in resp.iter_content(chunk_size=8192):
136 |             f.write(chunk)
137 |             write_size += len(chunk)
138 |
139 |     if write_size < image_size:
140 |         os.remove(filename)
141 |         logging.info(f"Saving image error: image is incomplete")
142 |         raise RequestException("The downloaded image is incomplete")
143 |     logging.info(f"Saved image: {filename}")
144 |
145 |     return url
146 |
147 |
148 | def download_worker(const: Constant):
149 |     session = get_session()
150 |
151 |     while appointment_jobs.qsize() < const.max_images:
152 |         try:
153 |             img = downloading_jobs.get_nowait()
154 |             with lock:
155 |                 if appointment_jobs.qsize() < const.max_images:
156 |                     appointment_jobs.put(img)
157 |                 else:
158 |                     break
159 |             logging.info(f"Download worker start...")
160 |             result = download_image(const, img, session)
161 |         except queue.Empty:
162 |             if const.cancel or const.end_crawler:
163 |                 break
164 |         except Exception as e:
165 |             logging.info(f"Download worker error: {e}")
166 |             result = const.img_url_prefix + img
167 |             const.status.failed.append(result)
168 |             print(
169 |                 f'{colored("[x]", "red", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
170 |                 f"{const.status.fmt_status}",
171 |                 end="\r" if not const.verbose else "\n",
172 |                 flush=True,
173 |             )
174 |         else:
175 |             logging.info(f"Download worker succeeded")
176 |             const.status.succeed.append(result)
177 |             print(
178 |                 f'{colored("[√]", "green", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
179 |                 f"{const.status.fmt_status}",
180 |                 end="\r" if not const.verbose else "\n",
181 |                 flush=True,
182 |             )
183 |
--------------------------------------------------------------------------------
/weibo_image_spider/utils.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:23
3 | import json
4 | import os
5 | import random
6 | import sys
7 | import threading
8 | import time
9 | from functools import wraps
10 |
11 | from requests import Session
12 |
13 | thread_local = threading.local()
14 |
15 |
16 | def cookies_from_raw(raw):
17 |     return dict([line.split("=")[0], line.split("=")[1]] for line in raw.split("; "))
18 |
19 |
20 | def get_session():
21 |     if not hasattr(thread_local, "session"):
22 |         thread_local.session = Session()
23 |     return thread_local.session
24 |
25 |
26 | def retry(exceptions=Exception, tries=3, delay=1, backoff=2, logger=None):
27 |     """
28 |     Retry calling the decorated function using an exponential backoff.
29 |     Args:
30 |         exceptions: The exception to check. May be a tuple of
31 |             exceptions to check.
32 |         tries: Number of times to try (not retry) before giving up.
33 |         delay: Initial delay between retries in seconds.
34 |         backoff: Backoff multiplier (e.g. value of 2 will double the delay
35 |             each retry).
36 |         logger: Logger to use. If None, print.
37 |     """
38 |
39 |     def deco_retry(f):
40 |         @wraps(f)
41 |         def f_retry(*args, **kwargs):
42 |             mtries, mdelay = tries, delay or random.uniform(0.5, 1.5)
43 |             while mtries > 1:
44 |                 try:
45 |                     return f(*args, **kwargs)
46 |                 except exceptions as e:
47 |                     if logger:
48 |                         logger.error("{}, Retrying in {} seconds...".format(e, mdelay))
49 |                     else:
50 |                         print("\n{}, Retrying in {} seconds...".format(e, mdelay))
51 |                     time.sleep(mdelay)
52 |                     mtries -= 1
53 |                     mdelay *= backoff
54 |             return f(*args, **kwargs)
55 |
56 |         return f_retry
57 |
58 |     return deco_retry
59 |
60 |
61 | def mkdirs_if_not_exist(dir):
62 |     if not os.path.isdir(dir):
63 |         try:
64 |             os.makedirs(dir)
65 |         except FileExistsError:
66 |             pass
67 |
68 |
69 | def convert_to_safe_filename(filename):
70 |     return "".join([c for c in filename if c not in r'\/:*?"<>|']).strip()
71 |
72 |
73 | def read_cookie():
74 |     with open("cookie", "r") as f:
75 |         return f.read().strip()
76 |
77 |
78 | def save_cookie(cookie):
79 |     with open("cookie", "w") as f:
80 |         f.write(cookie)
81 |
82 |
83 | def quit(msg, code=0):
84 |     print(msg)
85 |     sys.exit(code)
86 |
87 |
88 | def save_records(c):
89 |     filename = os.path.join(c.saved_dir, c.status.start_time.strftime("%Y-%m-%d_%H-%M-%S") + ".json")
90 |     with open(filename, "w") as f:
91 |         f.write(
92 |             json.dumps(
93 |                 {
94 |                     "nickname": c.user.name,
95 |                     "uid": c.user.uid,
96 |                     "datetime": c.status.start_time_repr,
97 |                     "succeed": {"count": len(c.status.succeed), "urls": c.status.succeed},
98 |                     "failed": {"count": len(c.status.failed), "urls": c.status.failed},
99 |                 },
100 |                 ensure_ascii=False,
101 |                 indent=2,
102 |             )
103 |         )
104 |
--------------------------------------------------------------------------------
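
Editor's note (not part of the repository): the `retry` decorator defined in `weibo_image_spider/utils.py` above is self-contained and can be reused on any flaky callable. The sketch below is a minimal, assumed example — `fetch_status` and the URL are hypothetical, not project code — illustrating the exponential backoff the docstring describes.

```python
# Minimal sketch (illustrative only, not repository code): reusing the retry
# decorator from weibo_image_spider.utils on a hypothetical flaky request.
import logging

import requests

from weibo_image_spider.utils import retry


@retry(exceptions=requests.RequestException, tries=3, delay=1, backoff=2, logger=logging)
def fetch_status(url: str) -> int:
    # Any RequestException triggers the decorator's backoff: it sleeps 1 s,
    # then 2 s, and lets the third and final attempt raise if it still fails.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.status_code


if __name__ == "__main__":
    print(fetch_status("https://example.com"))
```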