├── tumblrSpider
│   ├── tumblrSpider
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── tbr.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   ├── pipelines.py
│   │   └── middlewares.py
│   └── scrapy.cfg
└── README.md

/tumblrSpider/tumblrSpider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/spiders/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import scrapy


class TumblrspiderItem(scrapy.Item):

    file_url = scrapy.Field()    # direct URL of the photo/video file
    file_path = scrapy.Field()   # relative save path, "<type>/<file name>"
    file_type = scrapy.Field()   # post type: 'photo', 'video' or 'regular'
--------------------------------------------------------------------------------
/tumblrSpider/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = tumblrSpider.settings

[deploy]
#url = http://localhost:6800/
project = tumblrSpider
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


BOT_NAME = 'tumblrSpider'

SPIDER_MODULES = ['tumblrSpider.spiders']
NEWSPIDER_MODULE = 'tumblrSpider.spiders'

# FilesPipeline configuration: downloaded files are stored under ./data/
FILES_STORE = './data/'
FILES_EXPIRES = 90
FEED_EXPORT_ENCODING = 'utf-8'

DOWNLOADER_MIDDLEWARES = {
    'tumblrSpider.middlewares.LocalProxySpiderMiddleware': 543,
}

ITEM_PIPELINES = {
    'tumblrSpider.pipelines.TumblrspiderPipeline': 2,
    # 'tumblrSpider.pipelines.MyFilesPipeline': 1,
}

HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TumblrSpider
A Python crawler written with Scrapy. It collects the photos and videos a Tumblr user has posted and downloads them to the local disk.


### Project structure
* Spider: `tbr.py`
    1. Uses a Tumblr endpoint, `https://username.tumblr.com/api/read/json?start=0&num=200`, to fetch a user's posts (see the sketch below).
    2. Extracts the video or photo URLs from those posts.
    3. If a post is reblogged content, the original poster is added to the crawl; the crawl depth is configurable.

* Middleware: `middlewares.py`
    1. Sets a proxy. For well-known reasons Tumblr cannot be reached directly, so a circumvention tool is needed. With ssr in global mode you can crawl without adding a proxy; in PAC mode the local proxy has to be configured here.
    2. An overseas proxy IP can also be used directly (a sketch of this is included in `middlewares.py`).

* Items: `items.py`
    1. Three fields: `file_url`, `file_path` and `file_type`.

* Download pipelines: `pipelines.py`
    1. Scrapy can download files in two ways: with FilesPipeline or with requests.
    2. `TumblrspiderPipeline` is the FilesPipeline-based implementation.
    3. `MyFilesPipeline` is the requests-based implementation.
    4. Under the same network conditions the former is faster than the latter, so the first pipeline is the one to use.
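
The response from that endpoint is JavaScript rather than bare JSON: the body looks like `var tumblr_api_read = {...};`, which is why the spider strips the first 22 characters and the trailing `;` before parsing. A minimal sketch of fetching and unwrapping it outside Scrapy (the blog name is a placeholder, and you may need to route the request through your proxy):

```python
import json

import requests

url = 'https://someblog.tumblr.com/api/read/json?start=0&num=200'
text = requests.get(url, timeout=10).text  # add proxies=... if Tumblr is blocked on your network

# Strip the "var tumblr_api_read = " prefix and the trailing ";" wrapper.
payload = text[len('var tumblr_api_read = '):].rstrip().rstrip(';')
data = json.loads(payload)

for post in data['posts']:
    print(post['type'], post.get('url', ''))
```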

### Dependencies
* scrapy
* requests
* ssr (or another circumvention tool)

### Usage
* Make sure your machine can reach https://www.tumblr.com/ .
* In `./tumblrSpider/tumblrSpider/spiders/tbr.py`, put the homepage URL of one seed user into `start_urls`. `max_depth` sets the maximum crawl depth.
* From the `./tumblrSpider` directory, run `scrapy crawl tbr`.
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from scrapy.pipelines.files import FilesPipeline
from scrapy.http.request import Request
from scrapy.exceptions import DropItem
import requests
import os


class TumblrspiderPipeline(FilesPipeline):
    """FilesPipeline-based downloader: Scrapy fetches the file itself."""

    def file_path(self, request, response=None, info=None):
        # Save the file under the relative path computed by the spider.
        path = request.meta['file_path']
        return path

    def get_media_requests(self, item, info):
        file_url = item['file_url']
        file_path = item['file_path']
        yield Request(file_url, meta={'file_path': file_path})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            print(results)
            raise DropItem("Item contains no images")
        return item


class MyFilesPipeline(object):
    """requests-based downloader, kept as an alternative to the FilesPipeline."""

    def __init__(self):
        self.file_store = './MyData/'
        # Local proxy of the circumvention client (scheme included).
        self.proxies = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}

        if not os.path.exists(self.file_store):
            os.mkdir(self.file_store)

    def file_request(self, url, flag=1, timeout=5):
        # Retry with a longer timeout on timeouts, and up to three times on other errors.
        try:
            r = requests.get(url, proxies=self.proxies, timeout=timeout)
            return r
        except requests.exceptions.Timeout:
            timeout += 2
            return self.file_request(url, flag, timeout)
        except requests.exceptions.RequestException:
            if flag <= 3:
                flag += 1
                return self.file_request(url, flag, timeout)
            else:
                return None

    def process_item(self, item, spider):
        url = item['file_url']
        file_dir = self.file_store + item['file_type']
        path = self.file_store + item['file_path']

        if not os.path.exists(file_dir):
            os.mkdir(file_dir)
        if os.path.exists(path):
            # Already downloaded.
            return item

        r = self.file_request(url)
        if r:
            with open(path, 'wb') as f:
                f.write(r.content)

        return item
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LocalProxySpiderMiddleware(object):
    # Registered under DOWNLOADER_MIDDLEWARES in settings.py, so despite the
    # name it acts as a downloader middleware: every request is routed through
    # the local proxy of the circumvention client.

    def process_request(self, request, spider):
        # Scrapy expects a full proxy URL, including the scheme.
        request.meta['proxy'] = 'http://127.0.0.1:1080'
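

# A sketch of the overseas-proxy option mentioned in the README: instead of
# going through a local circumvention client, requests can be pointed at a
# remote proxy directly. The address below is a placeholder (not a working
# proxy), and this class is not enabled in settings.py.
class OverseasProxySpiderMiddleware(object):

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://203.0.113.10:8080'  # placeholder proxy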


class TumblrspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TumblrspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/spiders/tbr.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from tumblrSpider.items import TumblrspiderItem
import re
import json


class TbrSpider(scrapy.Spider):
    name = 'tbr'
    allowed_domains = ['tumblr.com']
    start_urls = ['https://balabala.tumblr.com/']   # seed user's homepage
    max_depth = 4                                   # maximum reblog-chasing depth

    # Redirect-handling settings, defined for convenience but not attached to
    # the requests below.
    meta = {
        'dont_redirect': True,
        'handle_httpstatus_list': [301, 302]
    }

    def start_requests(self):
        for url in self.start_urls:
            user_name = re.findall(r'://([^\.]*)\.tumblr\.com', url)[0]
            print(user_name)
            url = 'https://{}.tumblr.com/api/read/json?start=0&num=200'.format(user_name)
            yield Request(url, headers=self.get_headers(), meta={'depth': 0})

    def parse(self, response):
        # The endpoint returns "var tumblr_api_read = {...};", so strip the
        # 22-character prefix and the trailing ";" before parsing the JSON.
        data = response.text[22:-2]
        data = json.loads(data)

        posts = data['posts']
        for post in posts:
            if post['type'] == 'regular':
                # Text posts: look for an embedded video preview image and
                # rebuild the mp4 URL from its id.
                regular_body = post['regular-body']
                try:
                    video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg', regular_body)[0]
                    video_id = video_id.split('.')[0]
                    video_url = 'https://ve.media.tumblr.com/{}.mp4'.format(video_id)
                    video_name = video_url.split('/')[-1]
                    video_path = post['type'] + '/' + video_name

                    item = TumblrspiderItem()
                    item['file_url'] = video_url
                    item['file_path'] = video_path
                    item['file_type'] = post['type']
                    yield item

                except IndexError:
                    print(regular_body)

            elif post['type'] == 'video':
                # Video posts: the preview frame referenced in 'video-player'
                # carries the id of the 480p mp4.
                video_player = post['video-player']
                try:
                    video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg', video_player)[0]
                    video_url = 'https://vtt.tumblr.com/{}_480.mp4'.format(video_id)
                    video_name = video_url.split('/')[-1]
                    video_path = post['type'] + '/' + video_name

                    item = TumblrspiderItem()
                    item['file_url'] = video_url
                    item['file_path'] = video_path
                    item['file_type'] = post['type']
                    yield item

                except IndexError:
                    print(video_player)

            elif post['type'] == 'photo':
                photo_url = post['photo-url-1280']
                photo_name = photo_url.split('/')[-1]
                photo_path = post['type'] + '/' + photo_name

                item = TumblrspiderItem()
                item['file_url'] = photo_url
                item['file_path'] = photo_path
                item['file_type'] = post['type']
                yield item

            else:
                print(post['type'])

            # If the post was reblogged, crawl the original poster as well,
            # bounded by max_depth.
            try:
                reblogged_url = post['reblogged-from-url']
            except KeyError:
                continue
            try:
                user_name = re.findall(r'://([^\.]*)\.tumblr\.com', reblogged_url)[0]
            except IndexError:
                continue
            print(user_name)
            url = 'https://{}.tumblr.com/api/read/json?start=0&num=200'.format(user_name)

            depth = response.meta['depth'] + 1

            if depth <= self.max_depth:
                yield Request(url, headers=self.get_headers(),
                              callback=self.parse, meta={'depth': depth})
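
    # Worked example of the URL rebuilding in parse() above (domain and id are
    # illustrative): a post embeds a preview frame such as
    #     https://64.media.tumblr.com/tumblr_abc123_frame1.jpg
    # the regex r'/(tumblr_[^_]*)_[^\.]*?\.jpg' captures 'tumblr_abc123', and
    # the video itself is then requested from
    #     https://vtt.tumblr.com/tumblr_abc123_480.mp4   (for 'video' posts)
    #     https://ve.media.tumblr.com/tumblr_abc123.mp4  (for 'regular' posts)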

    def get_headers(self):
        # Browser-like request headers; Scrapy's compression middleware only
        # decodes gzip/deflate, so only those encodings are advertised.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
        return headers
--------------------------------------------------------------------------------