├── .gitignore
├── README.md
├── scrapy.cfg
└── weibo
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── weibocn.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
2 | *.pyc
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weibo
2 | 
3 | Weibo Spider Using Scrapy


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = weibo.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weibo
12 | 


--------------------------------------------------------------------------------
/weibo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/Weibo/bbc33e4907ba253960b5bff3fa9c16c4ca84a4a6/weibo/__init__.py


--------------------------------------------------------------------------------
/weibo/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item, Field
 9 | 
10 | 
11 | class UserItem(Item):
12 |     collection = 'users'
13 |     
14 |     id = Field()
15 |     name = Field()
16 |     avatar = Field()
17 |     cover = Field()
18 |     gender = Field()
19 |     description = Field()
20 |     fans_count = Field()
21 |     follows_count = Field()
22 |     weibos_count = Field()
23 |     verified = Field()
24 |     verified_reason = Field()
25 |     verified_type = Field()
26 |     follows = Field()
27 |     fans = Field()
28 |     crawled_at = Field()
29 | 
30 | 
31 | class UserRelationItem(Item):
32 |     collection = 'users'
33 |     
34 |     id = Field()
35 |     follows = Field()
36 |     fans = Field()
37 | 
38 | 
39 | class WeiboItem(Item):
40 |     collection = 'weibos'
41 |     
42 |     id = Field()
43 |     attitudes_count = Field()
44 |     comments_count = Field()
45 |     reposts_count = Field()
46 |     picture = Field()
47 |     pictures = Field()
48 |     source = Field()
49 |     text = Field()
50 |     raw_text = Field()
51 |     thumbnail = Field()
52 |     user = Field()
53 |     created_at = Field()
54 |     crawled_at = Field()
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/weibo/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | import json
 8 | import logging
 9 | from scrapy import signals
10 | import requests
11 | 
12 | 
13 | class ProxyMiddleware():
14 |     def __init__(self, proxy_url):
15 |         self.logger = logging.getLogger(__name__)
16 |         self.proxy_url = proxy_url
17 |     
18 |     def get_random_proxy(self):
19 |         try:
20 |             response = requests.get(self.proxy_url)
21 |             if response.status_code == 200:
22 |                 proxy = response.text
23 |                 return proxy
24 |         except requests.ConnectionError:
25 |             return False
26 |     
27 |     def process_request(self, request, spider):
28 |         if request.meta.get('retry_times'):
29 |             proxy = self.get_random_proxy()
30 |             if proxy:
31 |                 uri = 'https://{proxy}'.format(proxy=proxy)
32 |                 self.logger.debug('使用代理 ' + proxy)
33 |                 request.meta['proxy'] = uri
34 | 
35 |     @classmethod
36 |     def from_crawler(cls, crawler):
37 |         settings = crawler.settings
38 |         return cls(
39 |             proxy_url=settings.get('PROXY_URL')
40 |         )
41 | 
42 | 
43 | class CookiesMiddleware():
44 |     def __init__(self, cookies_url):
45 |         self.logger = logging.getLogger(__name__)
46 |         self.cookies_url = cookies_url
47 |     
48 |     def get_random_cookies(self):
49 |         try:
50 |             response = requests.get(self.cookies_url)
51 |             if response.status_code == 200:
52 |                 cookies = json.loads(response.text)
53 |                 return cookies
54 |         except requests.ConnectionError:
55 |             return False
56 |     
57 |     def process_request(self, request, spider):
58 |         self.logger.debug('正在获取Cookies')
59 |         cookies = self.get_random_cookies()
60 |         if cookies:
61 |             request.cookies = cookies
62 |             self.logger.debug('使用Cookies ' + json.dumps(cookies))
63 |     
64 |     @classmethod
65 |     def from_crawler(cls, crawler):
66 |         settings = crawler.settings
67 |         return cls(
68 |             cookies_url=settings.get('COOKIES_URL')
69 |         )
70 | 


--------------------------------------------------------------------------------
/weibo/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import re, time
 8 | 
 9 | import logging
10 | import pymongo
11 | 
12 | from weibo.items import *
13 | 
14 | 
15 | class TimePipeline():
16 |     def process_item(self, item, spider):
17 |         if isinstance(item, UserItem) or isinstance(item, WeiboItem):
18 |             now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
19 |             item['crawled_at'] = now
20 |         return item
21 | 
22 | 
23 | class WeiboPipeline():
24 |     def parse_time(self, date):
25 |         if re.match('刚刚', date):
26 |             date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
27 |         if re.match('\d+分钟前', date):
28 |             minute = re.match('(\d+)', date).group(1)
29 |             date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
30 |         if re.match('\d+小时前', date):
31 |             hour = re.match('(\d+)', date).group(1)
32 |             date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
33 |         if re.match('昨天.*', date):
34 |             date = re.match('昨天(.*)', date).group(1).strip()
35 |             date = time.strftime('%Y-%m-%d', time.localtime() - 24 * 60 * 60) + ' ' + date
36 |         if re.match('\d{2}-\d{2}', date):
37 |             date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
38 |         return date
39 |     
40 |     def process_item(self, item, spider):
41 |         if isinstance(item, WeiboItem):
42 |             if item.get('created_at'):
43 |                 item['created_at'] = item['created_at'].strip()
44 |                 item['created_at'] = self.parse_time(item.get('created_at'))
45 |             if item.get('pictures'):
46 |                 item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
47 |         return item
48 | 
49 | 
50 | class MongoPipeline(object):
51 |     def __init__(self, mongo_uri, mongo_db):
52 |         self.mongo_uri = mongo_uri
53 |         self.mongo_db = mongo_db
54 |     
55 |     @classmethod
56 |     def from_crawler(cls, crawler):
57 |         return cls(
58 |             mongo_uri=crawler.settings.get('MONGO_URI'),
59 |             mongo_db=crawler.settings.get('MONGO_DATABASE')
60 |         )
61 |     
62 |     def open_spider(self, spider):
63 |         self.client = pymongo.MongoClient(self.mongo_uri)
64 |         self.db = self.client[self.mongo_db]
65 |         self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
66 |         self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])
67 |     
68 |     def close_spider(self, spider):
69 |         self.client.close()
70 |     
71 |     def process_item(self, item, spider):
72 |         if isinstance(item, UserItem) or isinstance(item, WeiboItem):
73 |             self.db[item.collection].update({'id': item.get('id')}, {'$set': item}, True)
74 |         if isinstance(item, UserRelationItem):
75 |             self.db[item.collection].update(
76 |                 {'id': item.get('id')},
77 |                 {'$addToSet':
78 |                     {
79 |                         'follows': {'$each': item['follows']},
80 |                         'fans': {'$each': item['fans']}
81 |                     }
82 |                 }, True)
83 |         return item
84 | 


--------------------------------------------------------------------------------
/weibo/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for weibo project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'weibo'
 13 | 
 14 | SPIDER_MODULES = ['weibo.spiders']
 15 | NEWSPIDER_MODULE = 'weibo.spiders'
 16 | 
 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 18 | # USER_AGENT = 'weibo (+http://www.yourdomain.com)'
 19 | 
 20 | # Obey robots.txt rules
 21 | ROBOTSTXT_OBEY = False
 22 | 
 23 | DEFAULT_REQUEST_HEADERS = {
 24 |     'Accept': 'application/json, text/plain, */*',
 25 |     'Accept-Encoding': 'gzip, deflate, sdch',
 26 |     'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
 27 |     'Connection': 'keep-alive',
 28 |     'Host': 'm.weibo.cn',
 29 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
 30 |     'X-Requested-With': 'XMLHttpRequest',
 31 | }
 32 | 
 33 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 34 | # CONCURRENT_REQUESTS = 32
 35 | 
 36 | # Configure a delay for requests for the same website (default: 0)
 37 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 38 | # See also autothrottle settings and docs
 39 | # DOWNLOAD_DELAY = 3
 40 | # The download delay setting will honor only one of:
 41 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 42 | # CONCURRENT_REQUESTS_PER_IP = 16
 43 | 
 44 | # Disable cookies (enabled by default)
 45 | # COOKIES_ENABLED = False
 46 | 
 47 | # Disable Telnet Console (enabled by default)
 48 | # TELNETCONSOLE_ENABLED = False
 49 | 
 50 | # Override the default request headers:
 51 | # DEFAULT_REQUEST_HEADERS = {
 52 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 53 | #   'Accept-Language': 'en',
 54 | # }
 55 | 
 56 | # Enable or disable spider middlewares
 57 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 58 | # SPIDER_MIDDLEWARES = {
 59 | #    'weibo.middlewares.WeiboSpiderMiddleware': 543,
 60 | # }
 61 | 
 62 | # Enable or disable downloader middlewares
 63 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 64 | DOWNLOADER_MIDDLEWARES = {
 65 |     'weibo.middlewares.CookiesMiddleware': 554,
 66 |     'weibo.middlewares.ProxyMiddleware': 555,
 67 | }
 68 | 
 69 | # Enable or disable extensions
 70 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 71 | # EXTENSIONS = {
 72 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 73 | # }
 74 | 
 75 | # Configure item pipelines
 76 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 77 | ITEM_PIPELINES = {
 78 |     'weibo.pipelines.TimePipeline': 300,
 79 |     'weibo.pipelines.WeiboPipeline': 301,
 80 |     'weibo.pipelines.MongoPipeline': 302,
 81 | }
 82 | 
 83 | # Enable and configure the AutoThrottle extension (disabled by default)
 84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 85 | # AUTOTHROTTLE_ENABLED = True
 86 | # The initial download delay
 87 | # AUTOTHROTTLE_START_DELAY = 5
 88 | # The maximum download delay to be set in case of high latencies
 89 | # AUTOTHROTTLE_MAX_DELAY = 60
 90 | # The average number of requests Scrapy should be sending in parallel to
 91 | # each remote server
 92 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 93 | # Enable showing throttling stats for every response received:
 94 | # AUTOTHROTTLE_DEBUG = False
 95 | 
 96 | # Enable and configure HTTP caching (disabled by default)
 97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 98 | # HTTPCACHE_ENABLED = True
 99 | # HTTPCACHE_EXPIRATION_SECS = 0
100 | # HTTPCACHE_DIR = 'httpcache'
101 | # HTTPCACHE_IGNORE_HTTP_CODES = []
102 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
103 | 
104 | 
105 | MONGO_URI = 'localhost'
106 | 
107 | MONGO_DATABASE = 'weibo'
108 | 
109 | COOKIES_URL = 'http://localhost:5000/weibo/random'
110 | 
111 | PROXY_URL = 'http://localhost:5555/random'
112 | 
113 | RETRY_HTTP_CODES = [401, 403, 408, 414, 500, 502, 503, 504]
114 | 


--------------------------------------------------------------------------------
/weibo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/weibo/spiders/weibocn.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from scrapy import Request, Spider
  3 | from weibo.items import *
  4 | 
  5 | 
  6 | class WeiboSpider(Spider):
  7 |     name = 'weibocn'
  8 |     
  9 |     allowed_domains = ['m.weibo.cn']
 10 |     
 11 |     user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
 12 |     
 13 |     follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
 14 |     
 15 |     fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
 16 |     
 17 |     weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
 18 |     
 19 |     start_users = ['3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096']
 20 |     
 21 |     def start_requests(self):
 22 |         for uid in self.start_users:
 23 |             yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
 24 |     
 25 |     def parse_user(self, response):
 26 |         """
 27 |         解析用户信息
 28 |         :param response: Response对象
 29 |         """
 30 |         self.logger.debug(response)
 31 |         result = json.loads(response.text)
 32 |         if result.get('data').get('userInfo'):
 33 |             user_info = result.get('data').get('userInfo')
 34 |             user_item = UserItem()
 35 |             field_map = {
 36 |                 'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
 37 |                 'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
 38 |                 'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
 39 |                 'verified_reason': 'verified_reason', 'verified_type': 'verified_type'
 40 |             }
 41 |             for field, attr in field_map.items():
 42 |                 user_item[field] = user_info.get(attr)
 43 |             yield user_item
 44 |             # 关注
 45 |             uid = user_info.get('id')
 46 |             yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
 47 |                           meta={'page': 1, 'uid': uid})
 48 |             # 粉丝
 49 |             yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
 50 |                           meta={'page': 1, 'uid': uid})
 51 |             # 微博
 52 |             yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
 53 |                           meta={'page': 1, 'uid': uid})
 54 |     
 55 |     def parse_follows(self, response):
 56 |         """
 57 |         解析用户关注
 58 |         :param response: Response对象
 59 |         """
 60 |         result = json.loads(response.text)
 61 |         if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get('data').get('cards')[-1].get(
 62 |             'card_group'):
 63 |             # 解析用户
 64 |             follows = result.get('data').get('cards')[-1].get('card_group')
 65 |             for follow in follows:
 66 |                 if follow.get('user'):
 67 |                     uid = follow.get('user').get('id')
 68 |                     yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
 69 |             
 70 |             uid = response.meta.get('uid')
 71 |             # 关注列表
 72 |             user_relation_item = UserRelationItem()
 73 |             follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')} for follow in
 74 |                        follows]
 75 |             user_relation_item['id'] = uid
 76 |             user_relation_item['follows'] = follows
 77 |             user_relation_item['fans'] = []
 78 |             yield user_relation_item
 79 |             # 下一页关注
 80 |             page = response.meta.get('page') + 1
 81 |             yield Request(self.follow_url.format(uid=uid, page=page),
 82 |                           callback=self.parse_follows, meta={'page': page, 'uid': uid})
 83 |     
 84 |     def parse_fans(self, response):
 85 |         """
 86 |         解析用户粉丝
 87 |         :param response: Response对象
 88 |         """
 89 |         result = json.loads(response.text)
 90 |         if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get('data').get('cards')[-1].get(
 91 |             'card_group'):
 92 |             # 解析用户
 93 |             fans = result.get('data').get('cards')[-1].get('card_group')
 94 |             for fan in fans:
 95 |                 if fan.get('user'):
 96 |                     uid = fan.get('user').get('id')
 97 |                     yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
 98 |             
 99 |             uid = response.meta.get('uid')
100 |             # 粉丝列表
101 |             user_relation_item = UserRelationItem()
102 |             fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')} for fan in
103 |                     fans]
104 |             user_relation_item['id'] = uid
105 |             user_relation_item['fans'] = fans
106 |             user_relation_item['follows'] = []
107 |             yield user_relation_item
108 |             # 下一页粉丝
109 |             page = response.meta.get('page') + 1
110 |             yield Request(self.fan_url.format(uid=uid, page=page),
111 |                           callback=self.parse_fans, meta={'page': page, 'uid': uid})
112 |     
113 |     def parse_weibos(self, response):
114 |         """
115 |         解析微博列表
116 |         :param response: Response对象
117 |         """
118 |         result = json.loads(response.text)
119 |         if result.get('ok') and result.get('data').get('cards'):
120 |             weibos = result.get('data').get('cards')
121 |             for weibo in weibos:
122 |                 mblog = weibo.get('mblog')
123 |                 if mblog:
124 |                     weibo_item = WeiboItem()
125 |                     field_map = {
126 |                         'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
127 |                         'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
128 |                         'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
129 |                         'thumbnail': 'thumbnail_pic',
130 |                     }
131 |                     for field, attr in field_map.items():
132 |                         weibo_item[field] = mblog.get(attr)
133 |                     weibo_item['user'] = response.meta.get('uid')
134 |                     yield weibo_item
135 |             # 下一页微博
136 |             uid = response.meta.get('uid')
137 |             page = response.meta.get('page') + 1
138 |             yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
139 |                           meta={'uid': uid, 'page': page})
140 | 


--------------------------------------------------------------------------------