├── docs
│   ├── get_cookie.png
│   ├── screenshot_1.png
│   ├── screenshot_2.png
│   ├── get_cookie.md
│   └── README_CN.md
├── requirements.txt
├── main.py
├── weibo_image_spider
│   ├── __init__.py
│   ├── exceptions.py
│   ├── models.py
│   ├── constants.py
│   ├── utils.py
│   ├── cli.py
│   └── spider_workers.py
├── Pipfile
├── LICENSE
├── .gitignore
├── README.md
└── Pipfile.lock
/docs/get_cookie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/get_cookie.png
--------------------------------------------------------------------------------
/docs/screenshot_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/screenshot_1.png
--------------------------------------------------------------------------------
/docs/screenshot_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lonsty/weibo-image-spider/HEAD/docs/screenshot_2.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.8.2
2 | click==7.1.1
3 | termcolor==1.1.0
4 | requests==2.23.0
5 | pydantic==1.4
6 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | from weibo_image_spider.cli import weibo_command
4 |
5 | if __name__ == "__main__":
6 | weibo_command()
7 |
--------------------------------------------------------------------------------
/weibo_image_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:22
3 | from .spider_workers import crawl_worker, download_worker, query_user_by_name
4 |
5 | __author__ = "Allen Shaw"
6 | __version__ = "0.1.0"
7 |
8 | __all__ = ["crawl_worker", "download_worker", "query_user_by_name"]
9 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 |
8 | [packages]
9 | beautifulsoup4 = "==4.8.2"
10 | click = "==7.1.1"
11 | termcolor = "==1.1.0"
12 | requests = "==2.23.0"
13 | pydantic = "==1.4"
14 |
15 | [requires]
16 | python_version = "3.6"
17 |
--------------------------------------------------------------------------------
/weibo_image_spider/exceptions.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:01
3 |
4 |
5 | class CookiesExpiredException(Exception):
6 | pass
7 |
8 |
9 | class NoImagesException(Exception):
10 | pass
11 |
12 |
13 | class ContentParserError(Exception):
14 | pass
15 |
16 |
17 | class UserNotFound(Exception):
18 | pass
19 |
--------------------------------------------------------------------------------
/docs/get_cookie.md:
--------------------------------------------------------------------------------
1 | ## Getting the cookie for the web version of Weibo:
2 | 
3 | 1. Go to the Weibo homepage [https://www.weibo.com/](https://www.weibo.com/) and log in with your personal account;
4 | 
5 | 2. Taking Google Chrome as an example, press F12 to open the developer tools and click "Network" → "XHR", then press F5 to refresh. Select any request in the XHR list, open its Headers, copy the Cookie value from the Request Headers, and paste it into the [cookie](../cookie) file in the project root, replacing its previous content.
6 |
7 | 
8 |
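For reference, a minimal shell sketch of creating that `cookie` file from the project root; the cookie string below is only a placeholder, paste the value copied from your browser instead:

```sh
# Run from the repository root. Replace the placeholder with the Cookie value
# copied from the Request Headers; the spider reads this file on startup.
$ echo 'SUB=<your-sub-token>; SUBP=<your-subp-token>' > cookie
```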
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Allen Shaw
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/weibo_image_spider/models.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 15:30
3 | import time
4 | from datetime import datetime
5 | from queue import Queue
6 |
7 | from pydantic import BaseModel
8 | from termcolor import colored
9 |
10 | downloading_jobs = Queue()
11 | appointment_jobs = Queue()
12 |
13 |
14 | class User(BaseModel):
15 | name = ""
16 | uid: int = 0
17 | host: str = ""
18 |
19 |
20 | class PhotoAPI(BaseModel):
21 | action_data: str = ""
22 | page_id: int = 0
23 | page: int = 1
24 |
25 | @property
26 | def api(self):
27 | return (
28 | f"https://weibo.com/p/aj/album/loading?ajwvr=6&{self.action_data}"
29 | f"&page_id={self.page_id}&page={self.page}&ajax_call=1&__rnd={self.rnd}"
30 | )
31 |
32 | @property
33 | def rnd(self):
34 | return int(time.time() * 1000)
35 |
36 |
37 | class Parameters(BaseModel):
38 | nickname = ""
39 | uid: int = 0
40 | destination: str
41 | overwrite: bool
42 | thumbnail: bool
43 | max_images: int
44 | max_workers: int
45 | verbose: bool
46 |
47 |
48 | class Status(BaseModel):
49 | succeed = []
50 | failed = []
51 | start_time = datetime.now()
52 |
53 | @property
54 | def total_complete(self):
55 | return len(self.succeed) + len(self.failed)
56 |
57 | @property
58 | def start_time_repr(self):
59 | return self.start_time.ctime()
60 |
61 | @property
62 | def time_used(self):
63 | return str(datetime.now() - self.start_time)[:-7]
64 |
65 | @property
66 | def fmt_status(self):
67 | return (
68 | f'[Succeed: {colored(str(len(self.succeed)), "green")}, '
69 | f'Failed: {colored(str(len(self.failed)), "red")}]'
70 | )
71 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea
107 | weibo_images/
108 |
--------------------------------------------------------------------------------
/docs/README_CN.md:
--------------------------------------------------------------------------------
1 | # Weibo Image Spider
2 |
3 | 微博图片爬虫,极速下载、高清原图、多种命令、简单实用。
4 |
5 | ### 特点:
6 |
7 | - [x] 极速下载:多线程异步下载,可以根据需要设置线程数
8 | - [x] 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/!
9 | - [x] 增量下载:用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯!
10 | - [x] 高清原图:默认下载高清原图,可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
11 |
12 | ### 环境:
13 |
14 | - `python3.6` 及以上
15 |
16 | # 快速使用
17 |
18 | ## 1. 克隆项目到本地
19 |
20 | ```sh
21 | $ git clone https://github.com/lonsty/weibo-image-spider.git
22 | ```
23 |
24 | ## 2. 安装依赖包
25 |
26 | ```sh
27 | $ cd weibo-image-spider
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | ## 3. 快速使用
32 |
33 | **注意**:
34 |
35 | *因网页版微博限制,使用爬虫请求其 API 时,需要 cookie 认证,关于 [如何获取 cookie](get_cookie.md)?
36 | 且 cookie 有效期为一天(第二天零点失效),所以最好不要跨天爬取。*
37 |
38 | 下载用户昵称为 `nickname` 的最新 2000(可使用 `-n` 修改) 张图片到路径 `dest` 下:
39 |
40 | ```sh
41 | $ python main.py -u <nickname> -d <dest>
42 | ```
43 |
44 | 运行截图
45 |
46 | 
47 |
48 | 爬取结果
49 |
50 | 
51 |
52 | # 使用帮助
53 |
54 | ### 常用命令
55 |
56 | - 部分图片 **下载失败** 或 **微博有更新**,再执行相同的命令,对失败或新增的图片进行下载
57 |
58 | ```sh
59 | $ python main.py -u <nickname> -d <dest>
60 | ```
61 |
62 | ### 查看所有命令
63 |
64 | ```
65 | $ python main.py --help
66 |
67 | Usage: main.py [OPTIONS]
68 |
69 | A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.
70 |
71 | Options:
72 | -u, --nickname TEXT Nickname
73 | -d, --destination TEXT Directory to save images [default:
74 | weibo_images/]
75 |
76 | -o, --overwrite Overwrite existing files [default: False]
77 | -t, --thumbnail Download thumbnails with a maximum width of 690px
78 | [default: False]
79 |
80 | -n, --max-images INTEGER Maximum number of images to download [default:
81 | 2000]
82 |
83 | -w, --max-workers INTEGER Maximum thread workers [default: 15]
84 | -P, --proxies TEXT Use proxies to access websites. Example:
85 | '{"http": "user:passwd@www.example.com:port",
86 | "https": "user:passwd@www.example.com:port"}'
87 |
88 | --help Show this message and exit.
89 | ```
90 |
91 | # 更新历史
92 |
93 | - ## Version 0.1.0a (2020-03-29)
94 |
95 | 主要功能:
96 |
97 | - 极速下载:多线程异步下载,可以根据需要设置线程数
98 | - 异常重试:只要重试次数足够多,就没有下载不下来的图片 \(^o^)/!
99 | - 增量下载:用户有新的上传,再跑一遍程序就行了 O(∩_∩)O 嗯!
100 | - 高清原图:默认下载高清原图,可以使用参数 `--thumbnail` 下载缩略图(宽最大 690px)
101 |
102 | # LICENSE
103 |
104 | 此项目使用 [MIT](LICENSE) 开源协议
105 |
106 | **注意**:使用此工具下载的所有内容,版权归原作者所有,请谨慎使用!
107 |
--------------------------------------------------------------------------------
/weibo_image_spider/constants.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:27
3 | import json
4 | import logging
5 | import os
6 | import random
7 | import re
8 | import time
9 | # from random import choice, random
10 | from typing import List
11 |
12 | from pydantic import BaseModel
13 | from weibo_image_spider.models import PhotoAPI, Status, User
14 | from weibo_image_spider.utils import convert_to_safe_filename, read_cookie
15 |
16 |
17 | class Constant(BaseModel):
18 | search_url: str = "https://s.weibo.com/user?q={user}&Refer=weibo_user"
19 | search_api: str = "https://s.weibo.com/ajax/topsuggest.php?key={user}&_k={ts}&_t=1&outjson=1&uid={uid}"
20 | img_hosts: List[str] = ["https://wx1.sinaimg.cn", "https://wx2.sinaimg.cn", "https://wx3.sinaimg.cn"]
21 | cookies_raw: str = ""
22 | user: User = User()
23 | photo_api: PhotoAPI = PhotoAPI()
24 | status: Status = Status()
25 | nickname: str = "lonsty"
26 | destination: str = "weibo_images"
27 | overwrite: bool = False
28 | thumbnail: bool = False
29 | max_images: int = 2000
30 | max_workers: int = 15
31 | proxies_raw: str = None
32 | timeout: int = 10
33 | cancel: bool = False
34 | end_crawler: bool = False
35 | verbose: bool = False
36 |
37 | def __init__(self, **kargs):
38 | super(Constant, self).__init__(**kargs)
39 | self.cookies_raw = read_cookie()
40 |
41 | @property
42 | def cookies(self):
43 | try:
44 | return dict([item.split("=")[0], item.split("=")[1]] for item in self.cookies_raw.split("; "))
45 | except Exception as e:
46 | logging.warning(e)
47 | return None
48 |
49 | @property
50 | def img_url_prefix(self):
51 | return f'{random.choice(self.img_hosts)}/{"large" if not self.thumbnail else "mw690"}/'
52 |
53 | @property
54 | def saved_dir(self):
55 | return os.path.join(os.path.abspath(self.destination), convert_to_safe_filename(self.user.name))
56 |
57 | @property
58 | def rex_pattern(self):
59 |         return re.compile(r"(?<=/)\w*?\.(?:jpg|gif)", re.IGNORECASE)
60 |
61 | @property
62 | def user_photo_api(self):
63 | return self.photo_api.api
64 |
65 | @property
66 | def user_search_api(self):
67 | return self.search_api.format(
68 | user=self.nickname, ts=int(time.time() * 1000), uid=random.randrange(1_000_000_000, 9_999_999_999)
69 | )
70 |
71 | @property
72 | def proxies(self):
73 | if isinstance(self.proxies_raw, str):
74 | try:
75 | return json.loads(self.proxies_raw)
76 | except Exception as e:
77 | logging.warning(f"Proxy will not be used: {e}")
78 | return None
79 | return None
80 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weibo Image Spider
2 |
3 | A Weibo image spider: fast downloads, full-resolution originals, handy commands, simple and practical.
4 | 
5 | ### Features:
6 | 
7 | - [x] Fast downloads: multi-threaded asynchronous downloading, with a configurable number of worker threads
8 | - [x] Retry on failure: with enough retries, there is no image that cannot be downloaded \(^o^)/!
9 | - [x] Incremental downloads: when the user uploads something new, just run the program again O(∩_∩)O
10 | - [x] Original quality: downloads full-resolution originals by default; use `--thumbnail` to download thumbnails (maximum width 690px)
11 | 
12 | ### Requirements:
13 | 
14 | - `python3.6` or later
15 | 
16 | # Quick start
17 | 
18 | ## 1. Clone the repository
19 |
20 | ```sh
21 | $ git clone https://github.com/lonsty/weibo-image-spider.git
22 | ```
23 |
24 | ## 2. Install the dependencies
25 |
26 | ```sh
27 | $ cd weibo-image-spider
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | ## 3. Quick start
32 | 
33 | **Note**:
34 | 
35 | *Because of restrictions on the web version of Weibo, the spider needs cookie authentication when calling its API; see [how to get the cookie](docs/get_cookie.md).
36 | Also, a cookie is only valid for one day (it expires at midnight), so it is best not to crawl across days.*
37 | 
38 | Download the latest 2000 images (adjustable with `-n`) of the user whose nickname or user ID is `nickname` (or `user-id`) into the directory `dest`:
39 |
40 | ```sh
41 | $ python main.py -u <nickname> -d <dest>
42 | ```
43 |
44 | Screenshot of a run
45 | 
46 | 
47 | 
48 | Crawl results
49 | 
50 | 
51 |
52 | # Usage
53 | 
54 | ### Common commands
55 | 
56 | - If some images **failed to download** or the user has **posted new content**, run the same command again to download the failed or newly added images
57 |
58 | ```sh
59 | $ python main.py -u <nickname> -d <dest>
60 | ```
61 |
62 | ### Show all options
63 |
64 | ```
65 | $ python main.py --help
66 |
67 | Usage: main.py [OPTIONS]
68 |
69 | A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.
70 |
71 | Options:
72 | -u, --nickname, --user-id TEXT Nickname or User ID
73 | -d, --destination TEXT Directory to save images [default:
74 | weibo_images/]
75 |
76 | -o, --overwrite Overwrite existing files [default: False]
77 | -t, --thumbnail Download thumbnails with a maximum width of
78 | 690px [default: False]
79 |
80 | -n, --max-images INTEGER Maximum number of images to download
81 | [default: 2000]
82 |
83 | -w, --max-workers INTEGER Maximum thread workers [default: 15]
84 | -P, --proxies TEXT Use proxies to access websites. Example:
85 | '{"http":
86 | "user:passwd@www.example.com:port", "https":
87 | "user:passwd@www.example.com:port"}'
88 |
89 | --help Show this message and exit.
90 |
91 | ```
92 |
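For example, a single run combining several of these options (the nickname and the proxy credentials below are placeholders):

```sh
# Download up to 500 thumbnails with 20 worker threads, routing requests
# through an HTTP/HTTPS proxy (all values here are placeholders).
$ python main.py -u <nickname> -d weibo_images/ -n 500 -w 20 -t \
    -P '{"http": "user:passwd@www.example.com:port", "https": "user:passwd@www.example.com:port"}'
```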
93 | # Changelog
94 |
95 | - ## Version 0.1.2 (2021-11-13)
96 |
97 | - Fixed a failure when querying user information that prevented downloads from continuing
98 |
99 | - ## Version 0.1.1 (2021-08-26)
100 |
101 | New features:
102 |
103 | - Support downloading by user ID: `python main.py -u <user_id>`
104 |
105 | - ## Version 0.1.0 (2021-05-16)
106 |
107 | - Restructured the code
108 | - Fixed occasionally incomplete image downloads
109 | - Fixed the total number of downloads not matching the requested amount
110 |
111 | - ## Version 0.1.0a (2020-03-29)
112 |
113 | Main features:
114 |
115 | - Fast downloads: multi-threaded asynchronous downloading, with a configurable number of worker threads
116 | - Retry on failure: with enough retries, there is no image that cannot be downloaded \(^o^)/!
117 | - Incremental downloads: when the user uploads something new, just run the program again O(∩_∩)O
118 | - Original quality: downloads full-resolution originals by default; use `--thumbnail` to download thumbnails (maximum width 690px)
119 |
120 | # LICENSE
121 |
122 | This project is licensed under the [MIT](LICENSE) License.
123 |
124 | **Note**: The copyright of all content downloaded with this tool belongs to its original authors. Please use it with care!
125 |
--------------------------------------------------------------------------------
/weibo_image_spider/utils.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:23
3 | import json
4 | import os
5 | import random
6 | import sys
7 | import threading
8 | import time
9 | from functools import wraps
10 |
11 | from requests import Session
12 |
13 | thread_local = threading.local()
14 |
15 |
16 | def cookies_from_raw(raw):
17 | return dict([line.split("=")[0], line.split("=")[1]] for line in raw.split("; "))
18 |
19 |
20 | def get_session():
21 | if not hasattr(thread_local, "session"):
22 | thread_local.session = Session()
23 | return thread_local.session
24 |
25 |
26 | def retry(exceptions=Exception, tries=3, delay=1, backoff=2, logger=None):
27 | """
28 | Retry calling the decorated function using an exponential backoff.
29 | Args:
30 | exceptions: The exception to check. may be a tuple of
31 | exceptions to check.
32 | tries: Number of times to try (not retry) before giving up.
33 | delay: Initial delay between retries in seconds.
34 | backoff: Backoff multiplier (e.g. value of 2 will double the delay
35 | each retry).
36 | logger: Logger to use. If None, print.
37 | """
38 |
39 | def deco_retry(f):
40 | @wraps(f)
41 | def f_retry(*args, **kwargs):
42 | mtries, mdelay = tries, delay or random.uniform(0.5, 1.5)
43 | while mtries > 1:
44 | try:
45 | return f(*args, **kwargs)
46 | except exceptions as e:
47 | if logger:
48 | logger.error("{}, Retrying in {} seconds...".format(e, mdelay))
49 | else:
50 | print("\n{}, Retrying in {} seconds...".format(e, mdelay))
51 | time.sleep(mdelay)
52 | mtries -= 1
53 | mdelay *= backoff
54 | return f(*args, **kwargs)
55 |
56 | return f_retry
57 |
58 | return deco_retry
59 |
60 |
61 | def mkdirs_if_not_exist(dir):
62 | if not os.path.isdir(dir):
63 | try:
64 | os.makedirs(dir)
65 | except FileExistsError:
66 | pass
67 |
68 |
69 | def convert_to_safe_filename(filename):
70 | return "".join([c for c in filename if c not in r'\/:*?"<>|']).strip()
71 |
72 |
73 | def read_cookie():
74 | with open("cookie", "r") as f:
75 | return f.read().strip()
76 |
77 |
78 | def save_cookie(cookie):
79 | with open("cookie", "w") as f:
80 | f.write(cookie)
81 |
82 |
83 | def quit(msg, code=0):
84 | print(msg)
85 | sys.exit(code)
86 |
87 |
88 | def save_records(c):
89 | filename = os.path.join(c.saved_dir, c.status.start_time.strftime("%Y-%m-%d_%H-%M-%S") + ".json")
90 | with open(filename, "w") as f:
91 | f.write(
92 | json.dumps(
93 | {
94 | "nickname": c.user.name,
95 | "uid": c.user.uid,
96 | "datetime": c.status.start_time_repr,
97 | "succeed": {"count": len(c.status.succeed), "urls": c.status.succeed},
98 | "failed": {"count": len(c.status.failed), "urls": c.status.failed},
99 | },
100 | ensure_ascii=False,
101 | indent=2,
102 | )
103 | )
104 |
--------------------------------------------------------------------------------
/weibo_image_spider/cli.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 18:46
3 | import json
4 | import logging
5 | from concurrent.futures import ThreadPoolExecutor, wait
6 |
7 | import click
8 | from pydantic import ValidationError
9 | from requests.exceptions import ConnectionError, RequestException
10 | from termcolor import colored
11 |
12 | from weibo_image_spider.constants import Constant
13 | from weibo_image_spider.models import Parameters, PhotoAPI
14 | from weibo_image_spider.spider_workers import crawl_worker, download_worker, query_user_by_name
15 | from weibo_image_spider.utils import mkdirs_if_not_exist, quit, save_records
16 |
17 |
18 | @click.command(help="A Weibo image spider, visit https://github.com/lonsty/weibo-image-spider.")
19 | @click.option("-u", "--nickname", "nickname", help="Nickname")
20 | @click.option(
21 | "-d", "--destination", "destination", default="weibo_images/", show_default=True, help="Directory to save images"
22 | )
23 | @click.option(
24 | "-o", "--overwrite", "overwrite", is_flag=True, default=False, show_default=True, help="Overwrite existing files"
25 | )
26 | @click.option(
27 | "-t",
28 | "--thumbnail",
29 | "thumbnail",
30 | is_flag=True,
31 | default=False,
32 | show_default=True,
33 | help="Download thumbnails with a maximum width of 690px",
34 | )
35 | @click.option(
36 | "-n",
37 | "--max-images",
38 | "max_images",
39 | default=2000,
40 | show_default=True,
41 | type=int,
42 | help="Maximum number of images to download",
43 | )
44 | @click.option(
45 | "-w", "--max-workers", "max_workers", default=15, show_default=True, type=int, help="Maximum thread workers"
46 | )
47 | @click.option(
48 | "-P",
49 | "--proxies",
50 | "proxies_raw",
51 | help="Use proxies to access websites.\nExample:\n'"
52 | '{"http": "user:password@example.com:port",\n'
53 | '"https": "user:password@example.com:port"}\'',
54 | )
55 | @click.option(
56 | "-v",
57 | "--verbose",
58 | "verbose",
59 | is_flag=True,
60 | help="Show more information for debugging",
61 | )
62 | def weibo_command(**kwargs):
63 | try:
64 | paras = Parameters(**kwargs)
65 | const = Constant(**paras.dict())
66 | except ValidationError as e:
67 | quit("Invalid arguments: " + ", ".join([f'{a["loc"][0]} - {a["msg"]}' for a in json.loads(e.json())]), 1)
68 |
69 | logging.basicConfig(
70 | level=logging.INFO if const.verbose else logging.ERROR,
71 | format="[%(asctime)s %(threadName)-23s %(levelname)-5s %(lineno)3d] %(message)s",
72 | )
73 |
74 | try:
75 | const.user = query_user_by_name(const)
76 | except (ConnectionError, RequestException) as e:
77 | quit(f"Network error: {e}", 1)
78 |
79 | mkdirs_if_not_exist(const.saved_dir)
80 | print(
81 | f"\n - - - - - -+-+ {const.status.start_time_repr} +-+- - - - - -\n"
82 | f' Nickname: {colored(const.user.name, "cyan")}\n'
83 | f' User ID: {colored(const.user.uid, "cyan")}\n'
84 | f'Destination: {colored(const.saved_dir, attrs=["underline"])}\n'
85 | f" Overwrite: {const.overwrite}\n"
86 | f" Thumbnail: {const.thumbnail}\n"
87 | f" Max images: {const.max_images}\n"
88 | )
89 |
90 | const.photo_api = PhotoAPI(
91 | action_data=f"type=photo&owner_uid={const.user.uid}&viewer_uid={const.user.uid}" f"&since_id=-1",
92 | page_id=int(f"100505{const.user.uid}"),
93 | page=1,
94 | )
95 |
96 | with ThreadPoolExecutor(max_workers=const.max_workers + 1) as pool:
97 | img_crawler = pool.submit(crawl_worker, const)
98 | img_downloader = [pool.submit(download_worker, const) for _ in range(const.max_workers)]
99 | wait([img_crawler] + img_downloader)
100 |
101 | save_records(const)
102 | quit("\n\nDownload completed, bye bye ~")
103 |
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "hash": {
4 | "sha256": "29cab4f0b5fdb1c0d9a58d4d31abc8c8a71a3cd23750e9c65bd93b0ce19f0728"
5 | },
6 | "pipfile-spec": 6,
7 | "requires": {
8 | "python_version": "3.6"
9 | },
10 | "sources": [
11 | {
12 | "name": "pypi",
13 | "url": "https://pypi.org/simple",
14 | "verify_ssl": true
15 | }
16 | ]
17 | },
18 | "default": {
19 | "beautifulsoup4": {
20 | "hashes": [
21 | "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a",
22 | "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887",
23 | "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"
24 | ],
25 | "index": "pypi",
26 | "version": "==4.8.2"
27 | },
28 | "certifi": {
29 | "hashes": [
30 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
31 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
32 | ],
33 | "version": "==2019.11.28"
34 | },
35 | "chardet": {
36 | "hashes": [
37 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
38 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
39 | ],
40 | "version": "==3.0.4"
41 | },
42 | "click": {
43 | "hashes": [
44 | "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc",
45 | "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"
46 | ],
47 | "index": "pypi",
48 | "version": "==7.1.1"
49 | },
50 | "dataclasses": {
51 | "hashes": [
52 | "sha256:3459118f7ede7c8bea0fe795bff7c6c2ce287d01dd226202f7c9ebc0610a7836",
53 | "sha256:494a6dcae3b8bcf80848eea2ef64c0cc5cd307ffc263e17cdf42f3e5420808e6"
54 | ],
55 | "markers": "python_version < '3.7'",
56 | "version": "==0.7"
57 | },
58 | "idna": {
59 | "hashes": [
60 | "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
61 | "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
62 | ],
63 | "version": "==2.9"
64 | },
65 | "pydantic": {
66 | "hashes": [
67 | "sha256:012c422859bac2e03ab3151ea6624fecf0e249486be7eb8c6ee69c91740c6752",
68 | "sha256:07911aab70f3bc52bb845ce1748569c5e70478ac977e106a150dd9d0465ebf04",
69 | "sha256:47b8db7024ba3d46c3d4768535e1cf87b6c8cf92ccd81e76f4e1cb8ee47688b3",
70 | "sha256:50e4e948892a6815649ad5a9a9379ad1e5f090f17842ac206535dfaed75c6f2f",
71 | "sha256:51f11c8bbf794a68086540da099aae4a9107447c7a9d63151edbb7d50110cf21",
72 | "sha256:6100d7862371115c40be55cc4b8d766a74b1d0dbaf99dbfe72bb4bac0faf89ed",
73 | "sha256:61d22d36808087d3184ed6ac0d91dd71c533b66addb02e4a9930e1e30833202f",
74 | "sha256:72184c1421103cca128300120f8f1185fb42a9ea73a1c9845b1c53db8c026a7d",
75 | "sha256:831a0265a9e3933b3d0f04d1a81bba543bafbe4119c183ff2771871db70524ab",
76 | "sha256:8848b4eb458469739126e4c1a202d723dd092e087f8dbe3104371335f87ba5df",
77 | "sha256:bbbed364376f4a0aebb9ea452ff7968b306499a9e74f4db69b28ff2cd4043a11",
78 | "sha256:e27559cedbd7f59d2375bfd6eea29a330ea1a5b0589c34d6b4e0d7bec6027bbf",
79 | "sha256:f17ec336e64d4583311249fb179528e9a2c27c8a2eaf590ec6ec2c6dece7cb3f",
80 | "sha256:f863456d3d4bf817f2e5248553dee3974c5dc796f48e6ddb599383570f4215ac"
81 | ],
82 | "index": "pypi",
83 | "version": "==1.4"
84 | },
85 | "requests": {
86 | "hashes": [
87 | "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
88 | "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
89 | ],
90 | "index": "pypi",
91 | "version": "==2.23.0"
92 | },
93 | "soupsieve": {
94 | "hashes": [
95 | "sha256:e914534802d7ffd233242b785229d5ba0766a7f487385e3f714446a07bf540ae",
96 | "sha256:fcd71e08c0aee99aca1b73f45478549ee7e7fc006d51b37bec9e9def7dc22b69"
97 | ],
98 | "version": "==2.0"
99 | },
100 | "termcolor": {
101 | "hashes": [
102 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
103 | ],
104 | "index": "pypi",
105 | "version": "==1.1.0"
106 | },
107 | "urllib3": {
108 | "hashes": [
109 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc",
110 | "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"
111 | ],
112 | "version": "==1.25.8"
113 | }
114 | },
115 | "develop": {}
116 | }
117 |
--------------------------------------------------------------------------------
/weibo_image_spider/spider_workers.py:
--------------------------------------------------------------------------------
1 | # @AUTHOR : lonsty
2 | # @DATE : 2020/3/28 14:24
3 | import logging
4 | import os
5 | import queue
6 | import threading
7 |
8 | from bs4 import BeautifulSoup
9 | from requests import Session
10 | from requests.exceptions import ConnectionError, RequestException
11 | from termcolor import colored
12 |
13 | from .constants import Constant
14 | from .exceptions import ContentParserError, CookiesExpiredException, NoImagesException, UserNotFound
15 | from .models import User, appointment_jobs, downloading_jobs
16 | from .utils import get_session, retry, save_cookie
17 |
18 | lock = threading.RLock()
19 |
20 |
21 | @retry(logger=logging)
22 | def query_user_by_name(const: Constant):
23 | session = get_session()
24 |
25 | try:
26 | logging.info(f"Getting information of username: {const.nickname}...")
27 | resp = session.get(const.user_search_api, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
28 | resp.raise_for_status()
29 | except Exception as e:
30 | logging.info(f"Getting user information error: {e}")
31 | raise ConnectionError(e)
32 |
33 | try:
34 |         logging.info("Parsing user information from the search API response...")
35 | first = resp.json()["user"][0]
36 | name = first["u_name"]
37 | uid = first["u_id"]
38 | except (KeyError, IndexError) as e:
39 | logging.info(f"Parsing user information error: {e}")
40 | raise ContentParserError(
41 |             "Weibo API updated, please open an issue at https://github.com/lonsty/weibo-image-spider/issues."
42 | )
43 | user = User(name=name, host=f"https://weibo.com/u/{uid}", uid=uid)
44 | logging.info(f"Got information of username: {const.nickname}, {user}")
45 |
46 | return user
47 |
48 |
49 | @retry((RequestException, CookiesExpiredException), logger=logging)
50 | def crawl_image(const: Constant, url: str, session: Session):
51 | try:
52 | logging.info(f"Getting urls from page...")
53 | resp = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
54 | resp.raise_for_status()
55 | except Exception as e:
56 | logging.info(f"Getting urls from page error: {e}")
57 | raise RequestException(e)
58 |
59 | try:
60 | logging.info(f"Parsing urls from page...")
61 | soup = BeautifulSoup(resp.json().get("data"), "html.parser")
62 | boxes = soup.find_all("a", class_="ph_ar_box")
63 | for box in boxes:
64 | img = const.rex_pattern.search(box.find("img").get("src")).group(0)
65 | downloading_jobs.put(img)
66 | logging.info(f"Parsed {len(boxes)} urls from page")
67 | except Exception as e:
68 | logging.info(f"Parsing urls from page error: {e}")
69 |         raise CookiesExpiredException("Cookie has expired, please get a new one and paste it here:\n")
70 |
71 | logging.info(f"Parsing action-data from page...")
72 | card = soup.find("div", class_="WB_cardwrap")
73 | if not card:
74 | logging.info(f"No action-data in page")
75 | raise NoImagesException("No more images to crawl")
76 |
77 | action_data = card.get("action-data")
78 | const.photo_api.action_data = action_data
79 | logging.info(f"Got action-data from page: {action_data}")
80 |
81 |
82 | def crawl_worker(const: Constant):
83 | page = 1
84 | session = get_session()
85 |
86 | while appointment_jobs.qsize() < const.max_images:
87 | const.photo_api.page = page
88 | try:
89 | logging.info(f"Crawling page {page}...")
90 | crawl_image(const, const.user_photo_api, session)
91 | logging.info(f"Crawled page {page}")
92 | except CookiesExpiredException as e:
93 |             logging.info("Cookies have expired, requesting a new cookie")
94 | const.cookies_raw = input(str(e))
95 | save_cookie(const.cookies_raw)
96 | logging.info(f"Saved new cookies")
97 | continue
98 | except (NoImagesException, Exception) as e:
99 | logging.info(f"Crawling page: {e}")
100 | break
101 | page += 1
102 | const.end_crawler = True
103 |
104 |
105 | @retry(logger=logging)
106 | def download_image(const: Constant, img: str, session: Session):
107 | url = const.img_url_prefix + img
108 | filename = os.path.join(const.saved_dir, img)
109 |
110 | if (not const.overwrite) and os.path.isfile(filename):
111 | logging.info(f"Skipped downloaded image: {filename}")
112 | return url
113 |
114 | try:
115 |         logging.info("Requesting image headers...")
116 | head = session.get(url, cookies=const.cookies, proxies=const.proxies, timeout=const.timeout)
117 | head.raise_for_status()
118 | image_size = int(head.headers["Content-Length"].strip())
119 | logging.info(f"Got image: {url} size: {image_size}")
120 | except Exception as e:
121 |         logging.info(f"Requesting image headers error: {e}")
122 | raise RequestException(e)
123 |
124 | try:
125 | logging.info(f"Downloading image...")
126 | resp = session.get(url, cookies=const.cookies, proxies=const.proxies, stream=True, timeout=const.timeout)
127 | resp.raise_for_status()
128 | logging.info(f"Downloaded image")
129 | except Exception as e:
130 | logging.info(f"Downloading image error: {e}")
131 | raise RequestException(e)
132 |
133 | write_size = 0
134 | with open(filename, "wb") as f:
135 | for chunk in resp.iter_content(chunk_size=8192):
136 | f.write(chunk)
137 | write_size += len(chunk)
138 |
139 | if write_size < image_size:
140 | os.remove(filename)
141 | logging.info(f"Saving image error: image is incomplete")
142 | raise RequestException("The downloaded image is incomplete")
143 | logging.info(f"Saved image: {filename}")
144 |
145 | return url
146 |
147 |
148 | def download_worker(const: Constant):
149 | session = get_session()
150 |
151 | while appointment_jobs.qsize() < const.max_images:
152 | try:
153 | img = downloading_jobs.get_nowait()
154 | with lock:
155 | if appointment_jobs.qsize() < const.max_images:
156 | appointment_jobs.put(img)
157 | else:
158 | break
159 | logging.info(f"Download worker start...")
160 | result = download_image(const, img, session)
161 | except queue.Empty:
162 | if const.cancel or const.end_crawler:
163 | break
164 | except Exception as e:
165 | logging.info(f"Download worker error: {e}")
166 | result = const.img_url_prefix + img
167 | const.status.failed.append(result)
168 | print(
169 | f'{colored("[x]", "red", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
170 | f"{const.status.fmt_status}",
171 | end="\r" if not const.verbose else "\n",
172 | flush=True,
173 | )
174 | else:
175 | logging.info(f"Download worker succeed")
176 | const.status.succeed.append(result)
177 | print(
178 | f'{colored("[√]", "green", attrs=["reverse"])} {colored(result, attrs=["underline"])}\t'
179 | f"{const.status.fmt_status}",
180 | end="\r" if not const.verbose else "\n",
181 | flush=True,
182 | )
183 |
--------------------------------------------------------------------------------