├── tumblrSpider
│   ├── tumblrSpider
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── tbr.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   ├── pipelines.py
│   │   └── middlewares.py
│   └── scrapy.cfg
└── README.md

/tumblrSpider/tumblrSpider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/spiders/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import scrapy


class TumblrspiderItem(scrapy.Item):

    file_url = scrapy.Field()    # direct URL of the photo/video file
    file_path = scrapy.Field()   # relative save path, "<type>/<file name>"
    file_type = scrapy.Field()   # post type: 'photo', 'video' or 'regular'
--------------------------------------------------------------------------------
/tumblrSpider/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = tumblrSpider.settings

[deploy]
#url = http://localhost:6800/
project = tumblrSpider
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


BOT_NAME = 'tumblrSpider'

SPIDER_MODULES = ['tumblrSpider.spiders']
NEWSPIDER_MODULE = 'tumblrSpider.spiders'

# FilesPipeline configuration: downloaded files are stored under ./data/
FILES_STORE = './data/'
FILES_EXPIRES = 90
FEED_EXPORT_ENCODING = 'utf-8'

DOWNLOADER_MIDDLEWARES = {
    'tumblrSpider.middlewares.LocalProxySpiderMiddleware': 543,
}

ITEM_PIPELINES = {
    'tumblrSpider.pipelines.TumblrspiderPipeline': 2,
    # 'tumblrSpider.pipelines.MyFilesPipeline': 1,
}

HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TumblrSpider
A Python crawler written with Scrapy. It collects the photos and videos a Tumblr user has posted and downloads them to the local disk.


### Project structure
* Spider: `tbr.py`
    1. Uses a Tumblr endpoint, `https://username.tumblr.com/api/read/json?start=0&num=200`, to fetch a user's posts (see the sketch below).
    2. Extracts the video or photo URLs from those posts.
    3. If a post is reblogged content, the original poster is added to the crawl; the crawl depth is configurable.

* Middleware: `middlewares.py`
    1. Sets a proxy. For well-known reasons Tumblr cannot be reached directly, so a circumvention tool is needed. With ssr in global mode you can crawl without adding a proxy; in PAC mode the local proxy has to be configured here.
    2. An overseas proxy IP can also be used directly (a sketch of this is included in `middlewares.py`).

* Items: `items.py`
    1. Three fields: `file_url`, `file_path` and `file_type`.

* Download pipelines: `pipelines.py`
    1. Scrapy can download files in two ways: with FilesPipeline or with requests.
    2. `TumblrspiderPipeline` is the FilesPipeline-based implementation.
    3. `MyFilesPipeline` is the requests-based implementation.
    4. Under the same network conditions the former is faster than the latter, so the first pipeline is the one to use.
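
The response from that endpoint is JavaScript rather than bare JSON: the body looks like `var tumblr_api_read = {...};`, which is why the spider strips the first 22 characters and the trailing `;` before parsing. A minimal sketch of fetching and unwrapping it outside Scrapy (the blog name is a placeholder, and you may need to route the request through your proxy):

```python
import json

import requests

url = 'https://someblog.tumblr.com/api/read/json?start=0&num=200'
text = requests.get(url, timeout=10).text  # add proxies=... if Tumblr is blocked on your network

# Strip the "var tumblr_api_read = " prefix and the trailing ";" wrapper.
payload = text[len('var tumblr_api_read = '):].rstrip().rstrip(';')
data = json.loads(payload)

for post in data['posts']:
    print(post['type'], post.get('url', ''))
```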

### Dependencies
* scrapy
* requests
* ssr (or another circumvention tool)

### Usage
* Make sure your machine can reach https://www.tumblr.com/ .
* In `./tumblrSpider/tumblrSpider/spiders/tbr.py`, put the homepage URL of one seed user into `start_urls`. `max_depth` sets the maximum crawl depth.
* From the `./tumblrSpider` directory, run `scrapy crawl tbr`.
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from scrapy.pipelines.files import FilesPipeline
from scrapy.http.request import Request
from scrapy.exceptions import DropItem
import requests
import os


class TumblrspiderPipeline(FilesPipeline):
    """FilesPipeline-based downloader: Scrapy fetches the file itself."""

    def file_path(self, request, response=None, info=None):
        # Save the file under the relative path computed by the spider.
        path = request.meta['file_path']
        return path

    def get_media_requests(self, item, info):
        file_url = item['file_url']
        file_path = item['file_path']
        yield Request(file_url, meta={'file_path': file_path})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            print(results)
            raise DropItem("Item contains no images")
        return item


class MyFilesPipeline(object):
    """requests-based downloader, kept as an alternative to the FilesPipeline."""

    def __init__(self):
        self.file_store = './MyData/'
        # Local proxy of the circumvention client (scheme included).
        self.proxies = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}

        if not os.path.exists(self.file_store):
            os.mkdir(self.file_store)

    def file_request(self, url, flag=1, timeout=5):
        # Retry with a longer timeout on timeouts, and up to three times on other errors.
        try:
            r = requests.get(url, proxies=self.proxies, timeout=timeout)
            return r
        except requests.exceptions.Timeout:
            timeout += 2
            return self.file_request(url, flag, timeout)
        except requests.exceptions.RequestException:
            if flag <= 3:
                flag += 1
                return self.file_request(url, flag, timeout)
            else:
                return None

    def process_item(self, item, spider):
        url = item['file_url']
        file_dir = self.file_store + item['file_type']
        path = self.file_store + item['file_path']

        if not os.path.exists(file_dir):
            os.mkdir(file_dir)
        if os.path.exists(path):
            # Already downloaded.
            return item

        r = self.file_request(url)
        if r:
            with open(path, 'wb') as f:
                f.write(r.content)

        return item
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LocalProxySpiderMiddleware(object):
    # Registered under DOWNLOADER_MIDDLEWARES in settings.py, so despite the
    # name it acts as a downloader middleware: every request is routed through
    # the local proxy of the circumvention client.

    def process_request(self, request, spider):
        # Scrapy expects a full proxy URL, including the scheme.
        request.meta['proxy'] = 'http://127.0.0.1:1080'
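

# A sketch of the overseas-proxy option mentioned in the README: instead of
# going through a local circumvention client, requests can be pointed at a
# remote proxy directly. The address below is a placeholder (not a working
# proxy), and this class is not enabled in settings.py.
class OverseasProxySpiderMiddleware(object):

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://203.0.113.10:8080'  # placeholder proxy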


class TumblrspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TumblrspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/tumblrSpider/tumblrSpider/spiders/tbr.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from tumblrSpider.items import TumblrspiderItem
import re
import json


class TbrSpider(scrapy.Spider):
    name = 'tbr'
    allowed_domains = ['tumblr.com']
    start_urls = ['https://balabala.tumblr.com/']   # seed user's homepage
    max_depth = 4                                   # maximum reblog-chasing depth

    # Redirect-handling settings, defined for convenience but not attached to
    # the requests below.
    meta = {
        'dont_redirect': True,
        'handle_httpstatus_list': [301, 302]
    }

    def start_requests(self):
        for url in self.start_urls:
            user_name = re.findall(r'://([^\.]*)\.tumblr\.com', url)[0]
            print(user_name)
            url = 'https://{}.tumblr.com/api/read/json?start=0&num=200'.format(user_name)
            yield Request(url, headers=self.get_headers(), meta={'depth': 0})

    def parse(self, response):
        # The endpoint returns "var tumblr_api_read = {...};", so strip the
        # 22-character prefix and the trailing ";" before parsing the JSON.
        data = response.text[22:-2]
        data = json.loads(data)

        posts = data['posts']
        for post in posts:
            if post['type'] == 'regular':
                # Text posts: look for an embedded video preview image and
                # rebuild the mp4 URL from its id.
                regular_body = post['regular-body']
                try:
                    video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg', regular_body)[0]
                    video_id = video_id.split('.')[0]
                    video_url = 'https://ve.media.tumblr.com/{}.mp4'.format(video_id)
                    video_name = video_url.split('/')[-1]
                    video_path = post['type'] + '/' + video_name

                    item = TumblrspiderItem()
                    item['file_url'] = video_url
                    item['file_path'] = video_path
                    item['file_type'] = post['type']
                    yield item

                except IndexError:
                    print(regular_body)

            elif post['type'] == 'video':
                # Video posts: the preview frame referenced in 'video-player'
                # carries the id of the 480p mp4.
                video_player = post['video-player']
                try:
                    video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg', video_player)[0]
                    video_url = 'https://vtt.tumblr.com/{}_480.mp4'.format(video_id)
                    video_name = video_url.split('/')[-1]
                    video_path = post['type'] + '/' + video_name

                    item = TumblrspiderItem()
                    item['file_url'] = video_url
                    item['file_path'] = video_path
                    item['file_type'] = post['type']
                    yield item

                except IndexError:
                    print(video_player)

            elif post['type'] == 'photo':
                photo_url = post['photo-url-1280']
                photo_name = photo_url.split('/')[-1]
                photo_path = post['type'] + '/' + photo_name

                item = TumblrspiderItem()
                item['file_url'] = photo_url
                item['file_path'] = photo_path
                item['file_type'] = post['type']
                yield item

            else:
                print(post['type'])

            # If the post was reblogged, crawl the original poster as well,
            # bounded by max_depth.
            try:
                reblogged_url = post['reblogged-from-url']
            except KeyError:
                continue
            try:
                user_name = re.findall(r'://([^\.]*)\.tumblr\.com', reblogged_url)[0]
            except IndexError:
                continue
            print(user_name)
            url = 'https://{}.tumblr.com/api/read/json?start=0&num=200'.format(user_name)

            depth = response.meta['depth'] + 1

            if depth <= self.max_depth:
                yield Request(url, headers=self.get_headers(),
                              callback=self.parse, meta={'depth': depth})
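
    # Worked example of the URL rebuilding in parse() above (domain and id are
    # illustrative): a post embeds a preview frame such as
    #     https://64.media.tumblr.com/tumblr_abc123_frame1.jpg
    # the regex r'/(tumblr_[^_]*)_[^\.]*?\.jpg' captures 'tumblr_abc123', and
    # the video itself is then requested from
    #     https://vtt.tumblr.com/tumblr_abc123_480.mp4   (for 'video' posts)
    #     https://ve.media.tumblr.com/tumblr_abc123.mp4  (for 'regular' posts)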

    def get_headers(self):
        # Browser-like request headers; Scrapy's compression middleware only
        # decodes gzip/deflate, so only those encodings are advertised.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
        return headers
--------------------------------------------------------------------------------