├── .gitignore
├── LICENSE
├── README-CN.md
├── README.md
├── args.py
├── preview.jpg
├── requirements.txt
├── tumblr-crawler.py
└── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# added by tzw0745
venv*/
.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 唐志伟

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README-CN.md:
--------------------------------------------------------------------------------
### [English](/README.md) | 简体中文

# tumblr-crawler-cli
高性能&高定制化的Tumblr下载工具。
![preview](https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/master/preview.jpg)

# 特性
* 丰富的命令行参数。
* 支持多线程下载。
* 确保文件完整性。
* 支持自定义文件名格式。
* 支持同时处理多个tumblr站点。
* 兼容Python2和Python3。

# 准备工作
```shell
$ git clone git@github.com:tzw0745/tumblr-crawler-cli.git
$ cd tumblr-crawler-cli
$ pip install -r requirements.txt  # -i https://pypi.tuna.tsinghua.edu.cn/simple/
$ python tumblr-crawler.py --help
```
> 大陆用户推荐使用清华大学TUNA的pypi源以提高pip的安装速度。

> **注意:** 如果想使用socks代理,需要再安装一个第三方模块:**pySocks**
```shell
$ pip install pySocks
```

# 使用方法
```shell
usage: tumblr-crawler.py [-h] [-p] [-v] [-d SAVE_DIR] [-f FN_FMT] [-x PROXY]
                         [-n THREAD_NUM] [--min MIN_SIZE] [--overwrite]
                         [--interval INTERVAL] [--retries RETRIES]
                         sites [sites ...]

Crawl Tumblr Photos and Videos

positional arguments:
  sites                 tumblr sites

optional arguments:
  -h, --help            show this help message and exit
  -p, --photo           whether to download photos
  -v, --video           whether to download videos
  -d SAVE_DIR, --dir SAVE_DIR
                        download file save directory
  -f FN_FMT, --format FN_FMT
                        filename format
  -x PROXY, --proxy PROXY
                        HTTP request proxy, supports http/socks
  -n THREAD_NUM, --thread THREAD_NUM
                        number of download threads, default is 5
  --min MIN_SIZE        minimum size of downloaded files, default is 0k
                        (unlimited)
  --overwrite           overwrite file (if it exists)
  --interval INTERVAL   http request interval, default is 0.5 (seconds)
  --retries RETRIES     http request retries, default is 3
```

## 例子
* 下载Tumblr [@liamtbyrne](http://liamtbyrne.tumblr.com)和[@lizclimo](http://lizclimo.tumblr.com/)上所有的图片和视频:
```shell
$ python tumblr-crawler.py liamtbyrne lizclimo
```

* 指定下载文件类型:
```shell
$ python tumblr-crawler.py -p liamtbyrne      # 只下载图片
$ python tumblr-crawler.py --video liamtbyrne # 只下载视频
```

* 下载文件到其它文件夹:
```shell
$ python tumblr-crawler.py -d /somedir/ liamtbyrne
```

* 设置文件名格式:
```shell
$ python tumblr-crawler.py -f "{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}" liamtbyrne # 默认
$ python tumblr-crawler.py --format {uid} liamtbyrne
```
> 第一个例子下载文件的名称:"2015-10-16 06.04.53 GMT.13126.5pzVb1s7wpcjo10.jpg"
> 第二个例子下载文件的名称:"5pzVb1s7wpcjo10.jpg"
> `{uid}`是必须的,其它可选参数包括:
> * `{post_id}`:tumblr post id,类似`13126`;
> * `{type}`:`video`或`photo`;
> * `{date}`:tumblr post的时间日期,支持strftime风格的详细设定;
> * `{timestamp}`:unix时间戳,比如`1541405838`。

* 设置网络代理:
```shell
$ python tumblr-crawler.py --proxy http://127.0.0.1:1080 liamtbyrne # http proxy
$ python tumblr-crawler.py -x socks5h://127.0.0.1:1080 liamtbyrne   # socks5 proxy
```

* 设置更多下载线程以提高下载速度:
```shell
$ python tumblr-crawler.py -n 20 liamtbyrne
```

* 只下载超过一定大小的文件:
```shell
$ python tumblr-crawler.py --min 0.5m liamtbyrne # 只下载超过512k的文件
$ python tumblr-crawler.py --min 100k liamtbyrne # 只下载超过100k的文件
```

# 待添加的功能
* 配置文件。
* ……

# 更新日志
* 2018年12月17日:
  * 增加图片url兼容性。
* 2018年12月05日:
  * 支持inline格式图片url。
* 2018年11月05日:
  * 增加文件名格式设置。
* 2018年10月09日:
  * 修改命令行参数。
* 2018年10月06日:
  * 增加最小文件体积设置。
* 2018年10月04日:
  * 异步&多线程解析tumblr站点;
  * 优化代码结构;
  * 修改命令行参数。
* 2018年10月03日:
  * 优化媒体文件提取兼容性。
* 2018年09月29日:
  * **使用临时文件机制以确保文件被完整下载;**
  * 程序结束时提示文件总数;
  * 修复命令行参数BUG;
  * 修复多线程BUG。
* 2018年09月28日:
  * 第一个版本。

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### English | [简体中文](/README-CN.md)

# tumblr-crawler-cli
A fast, highly customizable Tumblr download tool.
![preview](https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/master/preview.jpg)

# Features
* Rich command-line parameter support.
* Multi-threaded download support.
* Ensures files are downloaded completely.
* Custom filename format.
* Supports crawling multiple sites at the same time.
* Python 2 & Python 3 compatibility.

# Preparation
```shell
$ git clone git@github.com:tzw0745/tumblr-crawler-cli.git
$ cd tumblr-crawler-cli
$ pip install -r requirements.txt  # -i https://pypi.tuna.tsinghua.edu.cn/simple/
$ python tumblr-crawler.py --help
```
> **NOTICE:** if you want to use a socks proxy with this program, you need to install an extra package: **pySocks**
```shell
$ pip install pySocks
```

# Usage
```shell
usage: tumblr-crawler.py [-h] [-p] [-v] [-d SAVE_DIR] [-f FN_FMT] [-x PROXY]
                         [-n THREAD_NUM] [--min MIN_SIZE] [--overwrite]
                         [--interval INTERVAL] [--retries RETRIES]
                         sites [sites ...]

Crawl Tumblr Photos and Videos

positional arguments:
  sites                 tumblr sites

optional arguments:
  -h, --help            show this help message and exit
  -p, --photo           whether to download photos
  -v, --video           whether to download videos
  -d SAVE_DIR, --dir SAVE_DIR
                        download file save directory
  -f FN_FMT, --format FN_FMT
                        filename format
  -x PROXY, --proxy PROXY
                        HTTP request proxy, supports http/socks
  -n THREAD_NUM, --thread THREAD_NUM
                        number of download threads, default is 5
  --min MIN_SIZE        minimum size of downloaded files, default is 0k
                        (unlimited)
  --overwrite           overwrite file (if it exists)
  --interval INTERVAL   http request interval, default is 0.5 (seconds)
  --retries RETRIES     http request retries, default is 3
```

## Examples
* download all photos and videos from Tumblr [@liamtbyrne](http://liamtbyrne.tumblr.com) & [@lizclimo](http://lizclimo.tumblr.com/):
```shell
$ python tumblr-crawler.py liamtbyrne lizclimo
```

* specify the download file type:
```shell
$ python tumblr-crawler.py -p liamtbyrne      # download photos only
$ python tumblr-crawler.py --video liamtbyrne # download videos only
```

* save downloaded files to another directory:
```shell
$ python tumblr-crawler.py -d /somedir/ liamtbyrne
```

* customize the filename format:
```shell
$ python tumblr-crawler.py -f "{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}" liamtbyrne # default
$ python tumblr-crawler.py --format {uid} liamtbyrne
```
> the first example saves files like: "2015-10-16 06.04.53 GMT.13126.5pzVb1s7wpcjo10.jpg"
> the second example saves files like: "5pzVb1s7wpcjo10.jpg"
> `{uid}` is required; other optional fields include:
> * `{post_id}`: id of the tumblr post, like `13126`;
> * `{type}`: `video` or `photo`;
> * `{date}`: datetime of the tumblr post, supports strftime-style directives;
> * `{timestamp}`: unix timestamp, like `1541405838`.
>
> A short Python sketch showing how these fields are rendered appears in the appendix at the end of this README.

* use a proxy to download files:
```shell
$ python tumblr-crawler.py --proxy http://127.0.0.1:1080 liamtbyrne # http proxy
$ python tumblr-crawler.py -x socks5h://127.0.0.1:1080 liamtbyrne   # socks5 proxy
```

* use more threads to speed up downloading:
```shell
$ python tumblr-crawler.py -n 20 liamtbyrne
```

* only download files larger than a certain size:
```shell
$ python tumblr-crawler.py --min 0.5m liamtbyrne # only download files larger than 512k
$ python tumblr-crawler.py --min 100k liamtbyrne # only download files larger than 100k
```

# Upcoming Features
* Configuration file.
* ...

# Changelog
* 2018-12-17:
  * improve image URL compatibility.
* 2018-12-05:
  * support inline photo URLs.
* 2018-11-05:
  * support custom filename format.
* 2018-10-09:
  * update command-line args.
* 2018-10-06:
  * add minimum file size support.
* 2018-10-04:
  * asynchronous & multi-threaded parsing of tumblr sites;
  * optimize code structure;
  * modify command-line parameters.
* 2018-10-03:
  * optimize media extraction compatibility.
* 2018-09-29:
  * **add temporary file support to make sure files download completely;**
  * add a file count summary after the program completes;
  * fix an args parsing bug;
  * fix a multi-threading bug.
* 2018-09-28:
  * First version.
--------------------------------------------------------------------------------
/args.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-9-28
"""
import argparse
import os


class ReadableDir(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        prospective_dir = values
        if not os.path.isdir(prospective_dir):
            err_msg = 'SAVE_DIR:{} is not a valid path'
            raise argparse.ArgumentTypeError(err_msg.format(prospective_dir))
        # the save directory must be writable, not merely readable
        if os.access(prospective_dir, os.W_OK):
            setattr(namespace, self.dest, prospective_dir)
        else:
            err_msg = 'SAVE_DIR:{} is not a writable dir'
            raise argparse.ArgumentTypeError(err_msg.format(prospective_dir))


class LimitInterval(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            interval = float(values)
        except ValueError:
            err_msg = 'INTERVAL:{} is not a float number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 0.1 <= interval <= 5:
            err_msg = 'INTERVAL must be between 0.1 and 5.0'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, interval)


class LimitRetries(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            retries = int(values)
        except ValueError:
            err_msg = 'RETRIES:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 0 <= retries <= 10:
            err_msg = 'RETRIES must be between 0 and 10'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, retries)


class LimitThread(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            thread = int(values)
        except ValueError:
            err_msg = 'THREAD_NUM:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 1 <= thread <= 20:
            err_msg = 'THREAD_NUM must be between 1 and 20'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, thread)


class LimitMinSize(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        # convert "0.5m"/"100k" style values into bytes
        num, unit = values[:-1], values[-1].lower()
        if unit not in ('k', 'm'):
            err_msg = 'MIN_SIZE:{} not end with k/K or m/M'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        try:
            min_size = float(num)
        except ValueError:
            err_msg = 'MIN_SIZE:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(num))
        if min_size < 0:
            err_msg = 'MIN_SIZE:{} cannot be a negative number'
            raise argparse.ArgumentTypeError(err_msg.format(num))

        multiple = {'k': 1024, 'm': 1024 * 1024}
        min_size = int(min_size * multiple[unit])
        setattr(namespace, self.dest, min_size)


class CheckFormat(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        if '{uid}' not in values:
            err_msg = 'FORMAT:{f} must contain {uid}'
            raise argparse.ArgumentTypeError(err_msg.format(f=values, uid='{uid}'))
        setattr(namespace, self.dest, values)


parser = argparse.ArgumentParser(
    description='Crawl Tumblr Photos and Videos'
)
# parser.add_argument(
#     '-c', '--config', dest='config', type=argparse.FileType('r'),
#     help='config file path'
# )
parser.add_argument(
    '-p', '--photo', dest='down_photo',
    action='store_true', help='whether to download photos'
)
parser.add_argument(
    '-v', '--video', dest='down_video',
    action='store_true', help='whether to download videos'
)
parser.add_argument(
    '-d', '--dir', dest='save_dir', action=ReadableDir,
    default='.', help='download file save directory'
)
parser.add_argument(
    '-f', '--format', dest='fn_fmt', action=CheckFormat, help='filename format',
    default='{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}'
)
parser.add_argument(
    '-x', '--proxy', dest='proxy',
    help='HTTP request proxy, supports http/socks'
)
parser.add_argument(
    '-n', '--thread', dest='thread_num', default=5, action=LimitThread,
    help='number of download threads, default is 5'
)
parser.add_argument(
    '--min', dest='min_size', default=0, action=LimitMinSize,
    help='minimum size of downloaded files, default is 0k (unlimited)'
)
parser.add_argument(
    '--overwrite', dest='overwrite', action='store_true',
    help='overwrite file (if it exists)'
)
parser.add_argument(
    '--interval', dest='interval', default=0.5, action=LimitInterval,
    help='http request interval, default is 0.5 (seconds)'
)
parser.add_argument(
    '--retries', dest='retries', default=3, action=LimitRetries,
    help='http request retries, default is 3'
)
parser.add_argument(dest='sites', help='tumblr sites', nargs='+')

--------------------------------------------------------------------------------
/preview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/64887a7489f81b6d72d3b673fb0d7812399cbe46/preview.jpg

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml

--------------------------------------------------------------------------------
/tumblr-crawler.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-9-28
"""
# region import
import json
import os
import re
import shutil
import time
from datetime import datetime
from threading import Thread

import requests
from lxml import etree, html

try:
    # Python 3 import
    from queue import Queue, Empty
except ImportError:
    # Python 2 import
    from Queue import Queue, Empty

try:
    # TemporaryDirectory is available since Python 3.2
    from tempfile import TemporaryDirectory

    temp_dir = TemporaryDirectory('tumblr_crawler_cli')
except ImportError:
    # Python 2 fallback: a manually managed directory, removed in main()
    temp_dir = '.tumblr_crawler_cli'
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

from args import parser
from utils import safe_format, clean_fn

# endregion

queue_sites = Queue()  # queue of sites waiting to be parsed
queue_down = Queue()  # download task queue
down_stop = False  # download stop signal
cli_args = parser.parse_args()  # command line arguments

# download both photos and videos by default
if not cli_args.down_photo and not cli_args.down_video:
    cli_args.down_photo = cli_args.down_video = True
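# For illustration (an assumption spelled out, not extra behavior): passing
# "-x socks5h://127.0.0.1:1080" on the command line, with pySocks installed,
# leaves the session created below configured as
#   session.proxies == {'http': 'socks5h://127.0.0.1:1080',
#                       'https': 'socks5h://127.0.0.1:1080'}
# so every request in this module is routed through that proxy.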
# create an HTTP request session and set the proxy
session = requests.session()
if cli_args.proxy:
    session.proxies = {'http': cli_args.proxy, 'https': cli_args.proxy}
# initialize the queue of sites to parse
for _site in cli_args.sites:
    queue_sites.put(_site)

# regex for extracting photo urls when the post info is not in the standard format
photo_regex = re.compile(r'https://\d+\.media\.tumblr\.com/\w{32}/tumblr_[\w.]+')


def _get(url, params=None, **kwargs):
    """
    Send an HTTP GET request to the target url; a wrapper around requests.get.
    :param url: request url
    :param params: query parameters
    :param kwargs: extra arguments passed through to session.get
    :return: requests.Response
    """
    for _retry in range(cli_args.retries):
        time.sleep(cli_args.interval)
        try:
            r = session.get(url, params=params, **kwargs)
            if r.status_code in (200, 404):
                break
        except requests.exceptions.RequestException:
            pass
    else:
        # all retries failed: make one last attempt and let any error propagate
        time.sleep(cli_args.interval)
        r = session.get(url, params=params, **kwargs)

    return r


def parse_site_thread():
    """
    Parse tumblr sites and put photo/video download tasks into the queue.
    """
    while not queue_sites.empty():
        try:
            site_name = queue_sites.get(block=True, timeout=0.5)
        except Empty:
            break

        print('start crawling tumblr site: {}'.format(site_name))
        site_dir = os.path.join(cli_args.save_dir, site_name)
        if not os.path.exists(site_dir):
            os.mkdir(site_dir)

        if cli_args.down_photo:
            for post in tumblr_posts(site_name, 'photo', get_method=_get):
                # put every photo url of the post into the download queue
                for photo_url in post['photos']:
                    uid_reg = r'[/_]([a-zA-Z0-9]{8,})_'
                    uid = re.findall(uid_reg, photo_url)[0]
                    args = {'post_id': post['id'], 'type': post['type'],
                            'uid': uid, 'date': post['gmt'],
                            'timestamp': post['timestamp']}
                    ext = re.findall(r'\.[a-zA-Z0-9]{3,}$', photo_url)[0]
                    filename = safe_format(cli_args.fn_fmt, **args) + ext
                    file_path = os.path.join(site_dir, clean_fn(filename))
                    queue_down.put((file_path, photo_url))
        if cli_args.down_video:
            for post in tumblr_posts(site_name, 'video', get_method=_get):
                # put the video url into the download queue
                uid = re.findall(r'tumblr_([a-zA-Z0-9]{15,})', post['video'])[0]
                args = {'post_id': post['id'], 'type': post['type'],
                        'uid': uid, 'date': post['gmt'],
                        'timestamp': post['timestamp']}
                filename = safe_format(cli_args.fn_fmt, **args)
                ext = '.' + post['ext']
                file_path = os.path.join(site_dir, clean_fn(filename + ext))
                queue_down.put((file_path, post['video']))


def download_thread(thread_name):
    """
    Keep fetching tasks from the download queue and downloading files
    until down_stop is set to True.
    :param thread_name: thread name, used in log output
    :return:
    """
    msg = ' '.join(['Thread', str(thread_name), '{}: {}'])
    global down_stop
    while not down_stop:
        # fetch one task; the blocking get with a timeout replaces the
        # previous busy-wait loop on an empty queue
        try:
            task_path, task_url = queue_down.get(block=True, timeout=0.5)
        except Empty:
            continue
        # skip the file if it already exists
        if not cli_args.overwrite and os.path.isfile(task_path):
            print(msg.format('Exists', task_path))
            continue
        # request the url; stream the body so large videos are not buffered
        # entirely in memory before being written to disk
        try:
            r = _get(task_url, timeout=3, stream=True)
        except requests.exceptions.RequestException as e:
            # request failed
            print(msg.format('RequestException', task_path))
            print(str(e))
            continue
        # write to a temporary file first
        _temp_name = 'tumblr_thread_{}.downloading'.format(thread_name)
        _temp_path = os.path.join(
            temp_dir if isinstance(temp_dir, str) else temp_dir.name,
            _temp_name
        )
        chunk_size = 2 * 1024 * 1024  # 2M buffer
        try:
            with open(_temp_path, 'wb') as f:
                for content in r.iter_content(chunk_size=chunk_size):
                    f.write(content)
            # check the file size against the --min setting
            if cli_args.min_size and \
                    os.path.getsize(_temp_path) < cli_args.min_size:
                print(msg.format('Too Small', task_path))
                continue
            # move to the target directory only after the download finished,
            # so no partial file is ever left in the save directory
            shutil.move(_temp_path, task_path)
        except (IOError, OSError):
            print(msg.format('IO/OSError', task_path))
            continue
        print(msg.format('Completed', task_path))


def tumblr_posts(site, post_type, get_method=requests.get):
    """
    Iterate over all photo or video posts of a tumblr site.
    :param site: site name
    :param post_type: post type, either 'photo' or 'video'
    :param get_method: method used to send GET requests
    :return: iterator of photo or video post info dicts
    """
    if not re.match(r'^[a-zA-Z0-9_-]+$', site):
        raise ValueError('Param "site" not match "^[a-zA-Z0-9_-]+$"')
    if post_type not in ('photo', 'video'):
        raise ValueError('Param "post_type" must be "photo" or "video"')

    def _max_width_sub(node, sub_name):
        """
        Return the text of the child of node with the largest max-width.
        :param node: xml parent node
        :param sub_name: child node name
        :return: text of the child node
        """
        return sorted(
            node.findall(sub_name),
            key=lambda _i: int(_i.get('max-width', '0'))
        )[-1].text

    page_size, start = 50, 0
    gmt_fmt = '%Y-%m-%d %H:%M:%S GMT'
    while True:
        api = 'http://{}.tumblr.com/api/read'.format(site)
        params = {'type': post_type, 'num': page_size, 'start': start}
        start += page_size
        # fetch one page of posts
        r = get_method(api, params=params, timeout=3)
        if r.status_code == 404:
            raise ValueError('tumblr site "{}" not found'.format(site))
        posts = etree.fromstring(r.content).find('posts').findall('post')
        if not posts:
            break

        for post in posts:
            post_info = {
                'id': post.get('id'),
                'gmt': datetime.strptime(post.get('date-gmt'), gmt_fmt),
                'type': post_type,
                'timestamp': post.get('unix-timestamp')
            }
            if post_type == 'photo':
                # collect all photo urls in the post
                if post.findall('photo-url'):  # standard format
                    photos = []
                    for photo_set in post.iterfind('photoset'):
                        for photo in photo_set.iterfind('photo'):
                            photos.append(_max_width_sub(photo, 'photo-url'))
                    first_photo = _max_width_sub(post, 'photo-url')
                    if first_photo not in photos:
                        photos.append(first_photo)
                else:  # non-standard format, fall back to the regex
                    photos = photo_regex.findall(''.join(post.itertext()))
                post_info['photos'] = list(set(photos))
                yield post_info
            elif post_type == 'video':
                # extract the video url
                try:
                    video_ext = post.find('video-source').find('extension').text
                except AttributeError:  # ignore non-standard formats
                    continue
                tree = html.fromstring(_max_width_sub(post, 'video-player'))
                options = json.loads(tree.get('data-crt-options'))
                if not options['hdUrl']:
                    options['hdUrl'] = tree.getchildren()[0].get('src')
                post_info.update({'video': options['hdUrl'], 'ext': video_ext})
                yield post_info


def main():
    global down_stop
    # parse sites with multiple threads (3 at most)
    parse_thread_pool = []
    for i in range(min(len(cli_args.sites), 3)):
        _t = Thread(target=parse_site_thread)
        _t.daemon = True
        _t.start()
        parse_thread_pool.append(_t)

    # download with multiple threads
    down_thread_pool = []
    for i in range(cli_args.thread_num):
        _t = Thread(target=download_thread, args=(i,))
        _t.daemon = True
        _t.start()
        down_thread_pool.append(_t)

    # wait for the site parsing threads to finish
    for thread in parse_thread_pool:
        thread.join()
    # wait for the download queue to drain
    while not queue_down.empty():
        time.sleep(0.5)
    # send the stop signal and wait for the download threads to finish
    down_stop = True
    for thread in down_thread_pool:
        thread.join()

    # remove the temporary directory (the Python 2 fallback case;
    # on Python 3, TemporaryDirectory cleans up after itself)
    if isinstance(temp_dir, str) and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-11-5
"""
import re
import string

_formatter = string.Formatter()


class SafeDict(dict):
    def __missing__(self, key):
        return '{' + key + '}'


def safe_format(fmt, **kwargs):
    """
    Safe string formatting that still works when an argument is missing:
    unknown fields are left in place instead of raising KeyError.
    :param fmt: format string
    :param kwargs: argument dict
    :return: formatted string
    """
    return _formatter.vformat(fmt, (), SafeDict(**kwargs))


def clean_fn(filename):
    """
    Remove characters that are not allowed in filenames.
    :param filename: filename
    :return: cleaned filename
    """
    return re.sub(r'[\\/:*?"<>|]+', '', filename)


def main():
    print(safe_format('{id}', uid=111))
    print(clean_fn('2018-10-1 19:00 <|>"\\/:*?".zip'))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
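
Beyond the command line, the `tumblr_posts` generator can also be driven directly, which is handy for scripting or testing. A minimal sketch, with two caveats: importing `tumblr-crawler.py` runs its CLI parsing at module level, so `sys.argv` has to be primed first, and it assumes network access and that Tumblr's legacy `/api/read` endpoint is still reachable. The site name is taken from the README examples.

```python
# Minimal sketch: using the tumblr_posts generator without the CLI.
import sys
import importlib

# importing "tumblr-crawler" executes parser.parse_args() at module level,
# so sys.argv must look like a valid command line first
sys.argv = ['tumblr-crawler.py', 'lizclimo']
# the filename contains a hyphen, so load it via importlib
crawler = importlib.import_module('tumblr-crawler')

# each yielded post is a dict with the keys the filename format uses:
# id, gmt (a datetime), type, timestamp, plus 'photos' (a list of urls)
# for photo posts or 'video'/'ext' for video posts
for post in crawler.tumblr_posts('lizclimo', 'photo'):
    print(post['id'], post['gmt'], len(post['photos']))
```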