├── .gitignore
├── LICENSE
├── README-CN.md
├── README.md
├── args.py
├── preview.jpg
├── requirements.txt
├── tumblr-crawler.py
└── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# added by tzw0745
venv*/
.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 唐志伟

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README-CN.md:
--------------------------------------------------------------------------------
### [English](/README.md) | 简体中文

# tumblr-crawler-cli
高性能&高定制化的Tumblr下载工具。
![preview](https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/master/preview.jpg)

# 特性
* 丰富的命令行参数。
* 支持多线程下载。
* 确保文件完整性。
* 支持自定义文件名格式。
* 支持同时处理多个tumblr站点。
* 兼容Python2和Python3。

# 准备工作
```shell
$ git clone git@github.com:tzw0745/tumblr-crawler-cli.git
$ cd tumblr-crawler-cli
$ pip install -r requirements.txt  # -i https://pypi.tuna.tsinghua.edu.cn/simple/
$ python tumblr-crawler.py --help
```
> 大陆用户推荐使用清华大学TUNA的pypi源以提高pip的安装速度。

> **注意:** 如果想使用socks代理,需要再安装一个第三方模块:**pySocks**
```shell
$ pip install pySocks
```

# 使用方法
```shell
usage: tumblr-crawler.py [-h] [-p] [-v] [-d SAVE_DIR] [-f FN_FMT] [-x PROXY]
                         [-n THREAD_NUM] [--min MIN_SIZE] [--overwrite]
                         [--interval INTERVAL] [--retries RETRIES]
                         sites [sites ...]

Crawl Tumblr Photos and Videos

positional arguments:
  sites                 tumblr sites

optional arguments:
  -h, --help            show this help message and exit
  -p, --photo           whether to download photos
  -v, --video           whether to download videos
  -d SAVE_DIR, --dir SAVE_DIR
                        download file save directory
  -f FN_FMT, --format FN_FMT
                        filename format
  -x PROXY, --proxy PROXY
                        HTTP request proxy, supports http/socks
  -n THREAD_NUM, --thread THREAD_NUM
                        number of download threads, default is 5
  --min MIN_SIZE        minimum size of downloaded files, default is 0k
                        (unlimited)
  --overwrite           overwrite file (if it exists)
  --interval INTERVAL   http request interval, default is 0.5 (seconds)
  --retries RETRIES     http request retries, default is 3
```

## 例子
* 下载Tumblr [@liamtbyrne](http://liamtbyrne.tumblr.com)和[@lizclimo](http://lizclimo.tumblr.com/)上所有的图片和视频:
```shell
$ python tumblr-crawler.py liamtbyrne lizclimo
```

* 指定下载文件类型:
```shell
$ python tumblr-crawler.py -p liamtbyrne      # 只下载图片
$ python tumblr-crawler.py --video liamtbyrne # 只下载视频
```

* 下载文件到其它文件夹:
```shell
$ python tumblr-crawler.py -d /somedir/ liamtbyrne
```

* 设置文件名格式:
```shell
$ python tumblr-crawler.py -f "{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}" liamtbyrne # 默认
$ python tumblr-crawler.py --format {uid} liamtbyrne
```
> 第一个例子下载文件的名称:"2015-10-16 06.04.53 GMT.13126.5pzVb1s7wpcjo10.jpg"
> 第二个例子下载文件的名称:"5pzVb1s7wpcjo10.jpg"
> `{uid}`是必须的,其它可选参数包括:
> * `{post_id}`:tumblr post id,类似`13126`;
> * `{type}`:`video`或`photo`;
> * `{date}`:tumblr post的时间日期,支持strftime风格的详细设定;
> * `{timestamp}`:unix时间戳,比如`1541405838`。

* 设置网络代理:
```shell
$ python tumblr-crawler.py --proxy http://127.0.0.1:1080 liamtbyrne # http proxy
$ python tumblr-crawler.py -x socks5h://127.0.0.1:1080 liamtbyrne   # socks5 proxy
```

* 设置更多下载线程以提高下载速度:
```shell
$ python tumblr-crawler.py -n 20 liamtbyrne
```

* 只下载超过一定大小的文件:
```shell
$ python tumblr-crawler.py --min 0.5m liamtbyrne # 只下载超过512k的文件
$ python tumblr-crawler.py --min 100k liamtbyrne # 只下载超过100k的文件
```

# 待添加的功能
* 配置文件。
* ……

# 更新日志
* 2018年12月17日:
  * 增加图片url兼容性。
* 2018年12月05日:
  * 支持inline格式图片url。
* 2018年11月05日:
  * 增加文件名格式设置。
* 2018年10月09日:
  * 修改命令行参数。
* 2018年10月06日:
  * 增加最小文件体积设置。
* 2018年10月04日:
  * 异步&多线程解析tumblr站点;
  * 优化代码结构;
  * 修改命令行参数。
* 2018年10月03日:
  * 优化媒体文件提取兼容性。
* 2018年09月29日:
  * **使用临时文件机制以确保文件被完整下载;**
  * 程序结束时提示文件总数;
  * 修复命令行参数BUG;
  * 修复多线程BUG。
* 2018年09月28日:
  * 第一个版本。

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### English | [简体中文](/README-CN.md)

# tumblr-crawler-cli
A fast, highly customizable Tumblr download tool.
![preview](https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/master/preview.jpg)

# Features
* Rich command-line parameter support.
* Multi-threaded download support.
* Ensures files are downloaded completely.
* Custom filename format.
* Supports crawling multiple sites at the same time.
* Python 2 & Python 3 compatibility.

# Preparation
```shell
$ git clone git@github.com:tzw0745/tumblr-crawler-cli.git
$ cd tumblr-crawler-cli
$ pip install -r requirements.txt  # -i https://pypi.tuna.tsinghua.edu.cn/simple/
$ python tumblr-crawler.py --help
```
> **NOTICE:** if you want to use a socks proxy with this program, you need to install an extra package: **pySocks**
```shell
$ pip install pySocks
```

# Usage
```shell
usage: tumblr-crawler.py [-h] [-p] [-v] [-d SAVE_DIR] [-f FN_FMT] [-x PROXY]
                         [-n THREAD_NUM] [--min MIN_SIZE] [--overwrite]
                         [--interval INTERVAL] [--retries RETRIES]
                         sites [sites ...]

Crawl Tumblr Photos and Videos

positional arguments:
  sites                 tumblr sites

optional arguments:
  -h, --help            show this help message and exit
  -p, --photo           whether to download photos
  -v, --video           whether to download videos
  -d SAVE_DIR, --dir SAVE_DIR
                        download file save directory
  -f FN_FMT, --format FN_FMT
                        filename format
  -x PROXY, --proxy PROXY
                        HTTP request proxy, supports http/socks
  -n THREAD_NUM, --thread THREAD_NUM
                        number of download threads, default is 5
  --min MIN_SIZE        minimum size of downloaded files, default is 0k
                        (unlimited)
  --overwrite           overwrite file (if it exists)
  --interval INTERVAL   http request interval, default is 0.5 (seconds)
  --retries RETRIES     http request retries, default is 3
```

## Examples
* download all photos and videos from Tumblr [@liamtbyrne](http://liamtbyrne.tumblr.com) & [@lizclimo](http://lizclimo.tumblr.com/):
```shell
$ python tumblr-crawler.py liamtbyrne lizclimo
```

* specify the download file type:
```shell
$ python tumblr-crawler.py -p liamtbyrne      # download photos only
$ python tumblr-crawler.py --video liamtbyrne # download videos only
```

* save downloaded files to another directory:
```shell
$ python tumblr-crawler.py -d /somedir/ liamtbyrne
```

* customize the filename format:
```shell
$ python tumblr-crawler.py -f "{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}" liamtbyrne # default
$ python tumblr-crawler.py --format {uid} liamtbyrne
```
> the first example saves files like: "2015-10-16 06.04.53 GMT.13126.5pzVb1s7wpcjo10.jpg"
> the second example saves files like: "5pzVb1s7wpcjo10.jpg"
> `{uid}` is required; other optional fields include:
> * `{post_id}`: id of the tumblr post, like `13126`;
> * `{type}`: `video` or `photo`;
> * `{date}`: datetime of the tumblr post, supports strftime-style directives;
> * `{timestamp}`: unix timestamp, like `1541405838`.
>
> A short Python sketch showing how these fields are rendered appears in the appendix at the end of this README.

* use a proxy to download files:
```shell
$ python tumblr-crawler.py --proxy http://127.0.0.1:1080 liamtbyrne # http proxy
$ python tumblr-crawler.py -x socks5h://127.0.0.1:1080 liamtbyrne   # socks5 proxy
```

* use more threads to speed up downloading:
```shell
$ python tumblr-crawler.py -n 20 liamtbyrne
```

* only download files larger than a certain size:
```shell
$ python tumblr-crawler.py --min 0.5m liamtbyrne # only download files larger than 512k
$ python tumblr-crawler.py --min 100k liamtbyrne # only download files larger than 100k
```

# Upcoming Features
* Configuration file.
* ...

# Changelog
* 2018-12-17:
  * improve image URL compatibility.
* 2018-12-05:
  * support inline photo URLs.
* 2018-11-05:
  * support custom filename format.
* 2018-10-09:
  * update command-line args.
* 2018-10-06:
  * add minimum file size support.
* 2018-10-04:
  * asynchronous & multi-threaded parsing of tumblr sites;
  * optimize code structure;
  * modify command-line parameters.
* 2018-10-03:
  * optimize media extraction compatibility.
* 2018-09-29:
  * **add temporary file support to make sure files download completely;**
  * add a file count summary after the program completes;
  * fix an args parsing bug;
  * fix a multi-threading bug.
* 2018-09-28:
  * First version.
--------------------------------------------------------------------------------
/args.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-9-28
"""
import argparse
import os


class ReadableDir(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        prospective_dir = values
        if not os.path.isdir(prospective_dir):
            err_msg = 'SAVE_DIR:{} is not a valid path'
            raise argparse.ArgumentTypeError(err_msg.format(prospective_dir))
        # the save directory must be writable, not merely readable
        if os.access(prospective_dir, os.W_OK):
            setattr(namespace, self.dest, prospective_dir)
        else:
            err_msg = 'SAVE_DIR:{} is not a writable dir'
            raise argparse.ArgumentTypeError(err_msg.format(prospective_dir))


class LimitInterval(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            interval = float(values)
        except ValueError:
            err_msg = 'INTERVAL:{} is not a float number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 0.1 <= interval <= 5:
            err_msg = 'INTERVAL must be between 0.1 and 5.0'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, interval)


class LimitRetries(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            retries = int(values)
        except ValueError:
            err_msg = 'RETRIES:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 0 <= retries <= 10:
            err_msg = 'RETRIES must be between 0 and 10'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, retries)


class LimitThread(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        try:
            thread = int(values)
        except ValueError:
            err_msg = 'THREAD_NUM:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        if not 1 <= thread <= 20:
            err_msg = 'THREAD_NUM must be between 1 and 20'
            raise argparse.ArgumentTypeError(err_msg)
        setattr(namespace, self.dest, thread)


class LimitMinSize(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        # convert "0.5m"/"100k" style values into bytes
        num, unit = values[:-1], values[-1].lower()
        if unit not in ('k', 'm'):
            err_msg = 'MIN_SIZE:{} not end with k/K or m/M'
            raise argparse.ArgumentTypeError(err_msg.format(values))
        try:
            min_size = float(num)
        except ValueError:
            err_msg = 'MIN_SIZE:{} is not a number'
            raise argparse.ArgumentTypeError(err_msg.format(num))
        if min_size < 0:
            err_msg = 'MIN_SIZE:{} cannot be a negative number'
            raise argparse.ArgumentTypeError(err_msg.format(num))

        multiple = {'k': 1024, 'm': 1024 * 1024}
        min_size = int(min_size * multiple[unit])
        setattr(namespace, self.dest, min_size)


class CheckFormat(argparse.Action):
    def __call__(self, _parser, namespace, values, option_string=None):
        if '{uid}' not in values:
            err_msg = 'FORMAT:{f} must contain {uid}'
            raise argparse.ArgumentTypeError(err_msg.format(f=values, uid='{uid}'))
        setattr(namespace, self.dest, values)


parser = argparse.ArgumentParser(
    description='Crawl Tumblr Photos and Videos'
)
# parser.add_argument(
#     '-c', '--config', dest='config', type=argparse.FileType('r'),
#     help='config file path'
# )
parser.add_argument(
    '-p', '--photo', dest='down_photo',
    action='store_true', help='whether to download photos'
)
parser.add_argument(
    '-v', '--video', dest='down_video',
    action='store_true', help='whether to download videos'
)
parser.add_argument(
    '-d', '--dir', dest='save_dir', action=ReadableDir,
    default='.', help='download file save directory'
)
parser.add_argument(
    '-f', '--format', dest='fn_fmt', action=CheckFormat, help='filename format',
    default='{date:%Y-%m-%d %H.%M.%S} GMT.{post_id}.{uid}'
)
parser.add_argument(
    '-x', '--proxy', dest='proxy',
    help='HTTP request proxy, supports http/socks'
)
parser.add_argument(
    '-n', '--thread', dest='thread_num', default=5, action=LimitThread,
    help='number of download threads, default is 5'
)
parser.add_argument(
    '--min', dest='min_size', default=0, action=LimitMinSize,
    help='minimum size of downloaded files, default is 0k (unlimited)'
)
parser.add_argument(
    '--overwrite', dest='overwrite', action='store_true',
    help='overwrite file (if it exists)'
)
parser.add_argument(
    '--interval', dest='interval', default=0.5, action=LimitInterval,
    help='http request interval, default is 0.5 (seconds)'
)
parser.add_argument(
    '--retries', dest='retries', default=3, action=LimitRetries,
    help='http request retries, default is 3'
)
parser.add_argument(dest='sites', help='tumblr sites', nargs='+')

--------------------------------------------------------------------------------
/preview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tzw0745/tumblr-crawler-cli/64887a7489f81b6d72d3b673fb0d7812399cbe46/preview.jpg

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml

--------------------------------------------------------------------------------
/tumblr-crawler.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-9-28
"""
# region import
import json
import os
import re
import shutil
import time
from datetime import datetime
from threading import Thread

import requests
from lxml import etree, html

try:
    # Python 3 import
    from queue import Queue, Empty
except ImportError:
    # Python 2 import
    from Queue import Queue, Empty

try:
    # TemporaryDirectory is available since Python 3.2
    from tempfile import TemporaryDirectory

    temp_dir = TemporaryDirectory('tumblr_crawler_cli')
except ImportError:
    # Python 2 fallback: a manually managed directory, removed in main()
    temp_dir = '.tumblr_crawler_cli'
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

from args import parser
from utils import safe_format, clean_fn

# endregion

queue_sites = Queue()  # queue of sites waiting to be parsed
queue_down = Queue()  # download task queue
down_stop = False  # download stop signal
cli_args = parser.parse_args()  # command line arguments

# download both photos and videos by default
if not cli_args.down_photo and not cli_args.down_video:
    cli_args.down_photo = cli_args.down_video = True
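# For illustration (an assumption spelled out, not extra behavior): passing
# "-x socks5h://127.0.0.1:1080" on the command line, with pySocks installed,
# leaves the session created below configured as
#   session.proxies == {'http': 'socks5h://127.0.0.1:1080',
#                       'https': 'socks5h://127.0.0.1:1080'}
# so every request in this module is routed through that proxy.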
# create an HTTP request session and set the proxy
session = requests.session()
if cli_args.proxy:
    session.proxies = {'http': cli_args.proxy, 'https': cli_args.proxy}
# initialize the queue of sites to parse
for _site in cli_args.sites:
    queue_sites.put(_site)

# regex for extracting photo urls when the post info is not in the standard format
photo_regex = re.compile(r'https://\d+\.media\.tumblr\.com/\w{32}/tumblr_[\w.]+')


def _get(url, params=None, **kwargs):
    """
    Send an HTTP GET request to the target url; a wrapper around requests.get.
    :param url: request url
    :param params: query parameters
    :param kwargs: extra arguments passed through to session.get
    :return: requests.Response
    """
    for _retry in range(cli_args.retries):
        time.sleep(cli_args.interval)
        try:
            r = session.get(url, params=params, **kwargs)
            if r.status_code in (200, 404):
                break
        except requests.exceptions.RequestException:
            pass
    else:
        # all retries failed: make one last attempt and let any error propagate
        time.sleep(cli_args.interval)
        r = session.get(url, params=params, **kwargs)

    return r


def parse_site_thread():
    """
    Parse tumblr sites and put photo/video download tasks into the queue.
    """
    while not queue_sites.empty():
        try:
            site_name = queue_sites.get(block=True, timeout=0.5)
        except Empty:
            break

        print('start crawling tumblr site: {}'.format(site_name))
        site_dir = os.path.join(cli_args.save_dir, site_name)
        if not os.path.exists(site_dir):
            os.mkdir(site_dir)

        if cli_args.down_photo:
            for post in tumblr_posts(site_name, 'photo', get_method=_get):
                # put every photo url of the post into the download queue
                for photo_url in post['photos']:
                    uid_reg = r'[/_]([a-zA-Z0-9]{8,})_'
                    uid = re.findall(uid_reg, photo_url)[0]
                    args = {'post_id': post['id'], 'type': post['type'],
                            'uid': uid, 'date': post['gmt'],
                            'timestamp': post['timestamp']}
                    ext = re.findall(r'\.[a-zA-Z0-9]{3,}$', photo_url)[0]
                    filename = safe_format(cli_args.fn_fmt, **args) + ext
                    file_path = os.path.join(site_dir, clean_fn(filename))
                    queue_down.put((file_path, photo_url))
        if cli_args.down_video:
            for post in tumblr_posts(site_name, 'video', get_method=_get):
                # put the video url into the download queue
                uid = re.findall(r'tumblr_([a-zA-Z0-9]{15,})', post['video'])[0]
                args = {'post_id': post['id'], 'type': post['type'],
                        'uid': uid, 'date': post['gmt'],
                        'timestamp': post['timestamp']}
                filename = safe_format(cli_args.fn_fmt, **args)
                ext = '.' + post['ext']
                file_path = os.path.join(site_dir, clean_fn(filename + ext))
                queue_down.put((file_path, post['video']))


def download_thread(thread_name):
    """
    Keep fetching tasks from the download queue and downloading files
    until down_stop is set to True.
    :param thread_name: thread name, used in log output
    :return:
    """
    msg = ' '.join(['Thread', str(thread_name), '{}: {}'])
    global down_stop
    while not down_stop:
        # fetch one task; the blocking get with a timeout replaces the
        # previous busy-wait loop on an empty queue
        try:
            task_path, task_url = queue_down.get(block=True, timeout=0.5)
        except Empty:
            continue
        # skip the file if it already exists
        if not cli_args.overwrite and os.path.isfile(task_path):
            print(msg.format('Exists', task_path))
            continue
        # request the url; stream the body so large videos are not buffered
        # entirely in memory before being written to disk
        try:
            r = _get(task_url, timeout=3, stream=True)
        except requests.exceptions.RequestException as e:
            # request failed
            print(msg.format('RequestException', task_path))
            print(str(e))
            continue
        # write to a temporary file first
        _temp_name = 'tumblr_thread_{}.downloading'.format(thread_name)
        _temp_path = os.path.join(
            temp_dir if isinstance(temp_dir, str) else temp_dir.name,
            _temp_name
        )
        chunk_size = 2 * 1024 * 1024  # 2M buffer
        try:
            with open(_temp_path, 'wb') as f:
                for content in r.iter_content(chunk_size=chunk_size):
                    f.write(content)
            # check the file size against the --min setting
            if cli_args.min_size and \
                    os.path.getsize(_temp_path) < cli_args.min_size:
                print(msg.format('Too Small', task_path))
                continue
            # move to the target directory only after the download finished,
            # so no partial file is ever left in the save directory
            shutil.move(_temp_path, task_path)
        except (IOError, OSError):
            print(msg.format('IO/OSError', task_path))
            continue
        print(msg.format('Completed', task_path))


def tumblr_posts(site, post_type, get_method=requests.get):
    """
    Iterate over all photo or video posts of a tumblr site.
    :param site: site name
    :param post_type: post type, either 'photo' or 'video'
    :param get_method: method used to send GET requests
    :return: iterator of photo or video post info dicts
    """
    if not re.match(r'^[a-zA-Z0-9_-]+$', site):
        raise ValueError('Param "site" not match "^[a-zA-Z0-9_-]+$"')
    if post_type not in ('photo', 'video'):
        raise ValueError('Param "post_type" must be "photo" or "video"')

    def _max_width_sub(node, sub_name):
        """
        Return the text of the child of node with the largest max-width.
        :param node: xml parent node
        :param sub_name: child node name
        :return: text of the child node
        """
        return sorted(
            node.findall(sub_name),
            key=lambda _i: int(_i.get('max-width', '0'))
        )[-1].text

    page_size, start = 50, 0
    gmt_fmt = '%Y-%m-%d %H:%M:%S GMT'
    while True:
        api = 'http://{}.tumblr.com/api/read'.format(site)
        params = {'type': post_type, 'num': page_size, 'start': start}
        start += page_size
        # fetch one page of posts
        r = get_method(api, params=params, timeout=3)
        if r.status_code == 404:
            raise ValueError('tumblr site "{}" not found'.format(site))
        posts = etree.fromstring(r.content).find('posts').findall('post')
        if not posts:
            break

        for post in posts:
            post_info = {
                'id': post.get('id'),
                'gmt': datetime.strptime(post.get('date-gmt'), gmt_fmt),
                'type': post_type,
                'timestamp': post.get('unix-timestamp')
            }
            if post_type == 'photo':
                # collect all photo urls in the post
                if post.findall('photo-url'):  # standard format
                    photos = []
                    for photo_set in post.iterfind('photoset'):
                        for photo in photo_set.iterfind('photo'):
                            photos.append(_max_width_sub(photo, 'photo-url'))
                    first_photo = _max_width_sub(post, 'photo-url')
                    if first_photo not in photos:
                        photos.append(first_photo)
                else:  # non-standard format, fall back to the regex
                    photos = photo_regex.findall(''.join(post.itertext()))
                post_info['photos'] = list(set(photos))
                yield post_info
            elif post_type == 'video':
                # extract the video url
                try:
                    video_ext = post.find('video-source').find('extension').text
                except AttributeError:  # ignore non-standard formats
                    continue
                tree = html.fromstring(_max_width_sub(post, 'video-player'))
                options = json.loads(tree.get('data-crt-options'))
                if not options['hdUrl']:
                    options['hdUrl'] = tree.getchildren()[0].get('src')
                post_info.update({'video': options['hdUrl'], 'ext': video_ext})
                yield post_info


def main():
    global down_stop
    # parse sites with multiple threads (3 at most)
    parse_thread_pool = []
    for i in range(min(len(cli_args.sites), 3)):
        _t = Thread(target=parse_site_thread)
        _t.daemon = True
        _t.start()
        parse_thread_pool.append(_t)

    # download with multiple threads
    down_thread_pool = []
    for i in range(cli_args.thread_num):
        _t = Thread(target=download_thread, args=(i,))
        _t.daemon = True
        _t.start()
        down_thread_pool.append(_t)

    # wait for the site parsing threads to finish
    for thread in parse_thread_pool:
        thread.join()
    # wait for the download queue to drain
    while not queue_down.empty():
        time.sleep(0.5)
    # send the stop signal and wait for the download threads to finish
    down_stop = True
    for thread in down_thread_pool:
        thread.join()

    # remove the temporary directory (the Python 2 fallback case;
    # on Python 3, TemporaryDirectory cleans up after itself)
    if isinstance(temp_dir, str) and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# coding:utf-8
"""
Created by tzw0745 at 18-11-5
"""
import re
import string

_formatter = string.Formatter()


class SafeDict(dict):
    def __missing__(self, key):
        return '{' + key + '}'


def safe_format(fmt, **kwargs):
    """
    Safe string formatting that still works when an argument is missing:
    unknown fields are left in place instead of raising KeyError.
    :param fmt: format string
    :param kwargs: argument dict
    :return: formatted string
    """
    return _formatter.vformat(fmt, (), SafeDict(**kwargs))


def clean_fn(filename):
    """
    Remove characters that are not allowed in filenames.
    :param filename: filename
    :return: cleaned filename
    """
    return re.sub(r'[\\/:*?"<>|]+', '', filename)


def main():
    print(safe_format('{id}', uid=111))
    print(clean_fn('2018-10-1 19:00 <|>"\\/:*?".zip'))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
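
Beyond the command line, the `tumblr_posts` generator can also be driven directly, which is handy for scripting or testing. A minimal sketch, with two caveats: importing `tumblr-crawler.py` runs its CLI parsing at module level, so `sys.argv` has to be primed first, and it assumes network access and that Tumblr's legacy `/api/read` endpoint is still reachable. The site name is taken from the README examples.

```python
# Minimal sketch: using the tumblr_posts generator without the CLI.
import sys
import importlib

# importing "tumblr-crawler" executes parser.parse_args() at module level,
# so sys.argv must look like a valid command line first
sys.argv = ['tumblr-crawler.py', 'lizclimo']
# the filename contains a hyphen, so load it via importlib
crawler = importlib.import_module('tumblr-crawler')

# each yielded post is a dict with the keys the filename format uses:
# id, gmt (a datetime), type, timestamp, plus 'photos' (a list of urls)
# for photo posts or 'video'/'ext' for video posts
for post in crawler.tumblr_posts('lizclimo', 'photo'):
    print(post['id'], post['gmt'], len(post['photos']))
```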