├── .gitignore ├── README.md ├── scrapy.cfg └── weibo ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py └── weibocn.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | *.pyc 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weibo 2 | 3 | Weibo Spider Using Scrapy -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weibo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weibo 12 | -------------------------------------------------------------------------------- /weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Python3WebSpider/Weibo/bbc33e4907ba253960b5bff3fa9c16c4ca84a4a6/weibo/__init__.py -------------------------------------------------------------------------------- /weibo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | collection = 'users' 13 | 14 | id = Field() 15 | name = Field() 16 | avatar = Field() 17 | cover = Field() 18 | gender = Field() 19 | description = Field() 20 | fans_count = Field() 21 | follows_count = Field() 22 | weibos_count = Field() 23 | verified = Field() 24 | verified_reason = Field() 25 | verified_type = Field() 26 | follows = Field() 27 | fans = Field() 28 | crawled_at = Field() 29 | 30 | 31 | class UserRelationItem(Item): 32 | collection = 'users' 33 | 34 | id = Field() 35 | follows = Field() 36 | fans = Field() 37 | 38 | 39 | class WeiboItem(Item): 40 | collection = 'weibos' 41 | 42 | id = Field() 43 | attitudes_count = Field() 44 | comments_count = Field() 45 | reposts_count = Field() 46 | picture = Field() 47 | pictures = Field() 48 | source = Field() 49 | text = Field() 50 | raw_text = Field() 51 | thumbnail = Field() 52 | user = Field() 53 | created_at = Field() 54 | crawled_at = Field() 55 | 56 | 57 | -------------------------------------------------------------------------------- /weibo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import json 8 | import logging 9 | from scrapy import signals 10 | import requests 11 | 12 | 13 | class ProxyMiddleware(): 14 | def __init__(self, proxy_url): 15 | self.logger = logging.getLogger(__name__) 16 | self.proxy_url = proxy_url 17 | 18 | def get_random_proxy(self): 19 | try: 20 | response = requests.get(self.proxy_url) 21 | if response.status_code == 200: 22 | proxy = response.text 23 | return proxy 24 | except requests.ConnectionError: 25 | return False 26 | 27 | def process_request(self, request, spider): 28 | if request.meta.get('retry_times'): 29 | proxy = self.get_random_proxy() 30 | if proxy: 31 | uri = 'https://{proxy}'.format(proxy=proxy) 32 | self.logger.debug('使用代理 ' + proxy) 33 | request.meta['proxy'] = uri 34 | 35 | @classmethod 36 | def from_crawler(cls, crawler): 37 | settings = crawler.settings 38 | return cls( 39 | proxy_url=settings.get('PROXY_URL') 40 | ) 41 | 42 | 43 | class CookiesMiddleware(): 44 | def __init__(self, cookies_url): 45 | self.logger = logging.getLogger(__name__) 46 | self.cookies_url = cookies_url 47 | 48 | def get_random_cookies(self): 49 | try: 50 | response = requests.get(self.cookies_url) 51 | if response.status_code == 200: 52 | cookies = json.loads(response.text) 53 | return cookies 54 | except requests.ConnectionError: 55 | return False 56 | 57 | def process_request(self, request, spider): 58 | self.logger.debug('正在获取Cookies') 59 | cookies = self.get_random_cookies() 60 | if cookies: 61 | request.cookies = cookies 62 | self.logger.debug('使用Cookies ' + json.dumps(cookies)) 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | settings = crawler.settings 67 | return cls( 68 | cookies_url=settings.get('COOKIES_URL') 69 | ) 70 | -------------------------------------------------------------------------------- /weibo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import re, time 8 | 9 | import logging 10 | import pymongo 11 | 12 | from weibo.items import * 13 | 14 | 15 | class TimePipeline(): 16 | def process_item(self, item, spider): 17 | if isinstance(item, UserItem) or isinstance(item, WeiboItem): 18 | now = time.strftime('%Y-%m-%d %H:%M', time.localtime()) 19 | item['crawled_at'] = now 20 | return item 21 | 22 | 23 | class WeiboPipeline(): 24 | def parse_time(self, date): 25 | if re.match('刚刚', date): 26 | date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())) 27 | if re.match('\d+分钟前', date): 28 | minute = re.match('(\d+)', date).group(1) 29 | date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60)) 30 | if re.match('\d+小时前', date): 31 | hour = re.match('(\d+)', date).group(1) 32 | date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60)) 33 | if re.match('昨天.*', date): 34 | date = re.match('昨天(.*)', date).group(1).strip() 35 | date = time.strftime('%Y-%m-%d', time.localtime() - 24 * 60 * 60) + ' ' + date 36 | if re.match('\d{2}-\d{2}', date): 37 | date = time.strftime('%Y-', time.localtime()) + date + ' 00:00' 38 | return date 39 | 40 | def process_item(self, item, spider): 41 | if isinstance(item, WeiboItem): 42 | if item.get('created_at'): 43 | item['created_at'] = item['created_at'].strip() 44 | item['created_at'] = self.parse_time(item.get('created_at')) 45 | if item.get('pictures'): 46 | item['pictures'] = [pic.get('url') for pic in item.get('pictures')] 47 | return item 48 | 49 | 50 | class MongoPipeline(object): 51 | def __init__(self, mongo_uri, mongo_db): 52 | self.mongo_uri = mongo_uri 53 | self.mongo_db = mongo_db 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | return cls( 58 | mongo_uri=crawler.settings.get('MONGO_URI'), 59 | mongo_db=crawler.settings.get('MONGO_DATABASE') 60 | ) 61 | 62 | def open_spider(self, spider): 63 | self.client = pymongo.MongoClient(self.mongo_uri) 64 | self.db = self.client[self.mongo_db] 65 | self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)]) 66 | self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)]) 67 | 68 | def close_spider(self, spider): 69 | self.client.close() 70 | 71 | def process_item(self, item, spider): 72 | if isinstance(item, UserItem) or isinstance(item, WeiboItem): 73 | self.db[item.collection].update({'id': item.get('id')}, {'$set': item}, True) 74 | if isinstance(item, UserRelationItem): 75 | self.db[item.collection].update( 76 | {'id': item.get('id')}, 77 | {'$addToSet': 78 | { 79 | 'follows': {'$each': item['follows']}, 80 | 'fans': {'$each': item['fans']} 81 | } 82 | }, True) 83 | return item 84 | -------------------------------------------------------------------------------- /weibo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for weibo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'weibo' 13 | 14 | SPIDER_MODULES = ['weibo.spiders'] 15 | NEWSPIDER_MODULE = 'weibo.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'weibo (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | DEFAULT_REQUEST_HEADERS = { 24 | 'Accept': 'application/json, text/plain, */*', 25 | 'Accept-Encoding': 'gzip, deflate, sdch', 26 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2', 27 | 'Connection': 'keep-alive', 28 | 'Host': 'm.weibo.cn', 29 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', 30 | 'X-Requested-With': 'XMLHttpRequest', 31 | } 32 | 33 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 34 | # CONCURRENT_REQUESTS = 32 35 | 36 | # Configure a delay for requests for the same website (default: 0) 37 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 38 | # See also autothrottle settings and docs 39 | # DOWNLOAD_DELAY = 3 40 | # The download delay setting will honor only one of: 41 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 42 | # CONCURRENT_REQUESTS_PER_IP = 16 43 | 44 | # Disable cookies (enabled by default) 45 | # COOKIES_ENABLED = False 46 | 47 | # Disable Telnet Console (enabled by default) 48 | # TELNETCONSOLE_ENABLED = False 49 | 50 | # Override the default request headers: 51 | # DEFAULT_REQUEST_HEADERS = { 52 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 53 | # 'Accept-Language': 'en', 54 | # } 55 | 56 | # Enable or disable spider middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 58 | # SPIDER_MIDDLEWARES = { 59 | # 'weibo.middlewares.WeiboSpiderMiddleware': 543, 60 | # } 61 | 62 | # Enable or disable downloader middlewares 63 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 64 | DOWNLOADER_MIDDLEWARES = { 65 | 'weibo.middlewares.CookiesMiddleware': 554, 66 | 'weibo.middlewares.ProxyMiddleware': 555, 67 | } 68 | 69 | # Enable or disable extensions 70 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 71 | # EXTENSIONS = { 72 | # 'scrapy.extensions.telnet.TelnetConsole': None, 73 | # } 74 | 75 | # Configure item pipelines 76 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 77 | ITEM_PIPELINES = { 78 | 'weibo.pipelines.TimePipeline': 300, 79 | 'weibo.pipelines.WeiboPipeline': 301, 80 | 'weibo.pipelines.MongoPipeline': 302, 81 | } 82 | 83 | # Enable and configure the AutoThrottle extension (disabled by default) 84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 85 | # AUTOTHROTTLE_ENABLED = True 86 | # The initial download delay 87 | # AUTOTHROTTLE_START_DELAY = 5 88 | # The maximum download delay to be set in case of high latencies 89 | # AUTOTHROTTLE_MAX_DELAY = 60 90 | # The average number of requests Scrapy should be sending in parallel to 91 | # each remote server 92 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 93 | # Enable showing throttling stats for every response received: 94 | # AUTOTHROTTLE_DEBUG = False 95 | 96 | # Enable and configure HTTP caching (disabled by default) 97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 98 | # HTTPCACHE_ENABLED = True 99 | # HTTPCACHE_EXPIRATION_SECS = 0 100 | # HTTPCACHE_DIR = 'httpcache' 101 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 102 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 103 | 104 | 105 | MONGO_URI = 'localhost' 106 | 107 | MONGO_DATABASE = 'weibo' 108 | 109 | COOKIES_URL = 'http://localhost:5000/weibo/random' 110 | 111 | PROXY_URL = 'http://localhost:5555/random' 112 | 113 | RETRY_HTTP_CODES = [401, 403, 408, 414, 500, 502, 503, 504] 114 | -------------------------------------------------------------------------------- /weibo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /weibo/spiders/weibocn.py: -------------------------------------------------------------------------------- 1 | import json 2 | from scrapy import Request, Spider 3 | from weibo.items import * 4 | 5 | 6 | class WeiboSpider(Spider): 7 | name = 'weibocn' 8 | 9 | allowed_domains = ['m.weibo.cn'] 10 | 11 | user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}' 12 | 13 | follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}' 14 | 15 | fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}' 16 | 17 | weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}' 18 | 19 | start_users = ['3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096'] 20 | 21 | def start_requests(self): 22 | for uid in self.start_users: 23 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user) 24 | 25 | def parse_user(self, response): 26 | """ 27 | 解析用户信息 28 | :param response: Response对象 29 | """ 30 | self.logger.debug(response) 31 | result = json.loads(response.text) 32 | if result.get('data').get('userInfo'): 33 | user_info = result.get('data').get('userInfo') 34 | user_item = UserItem() 35 | field_map = { 36 | 'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone', 37 | 'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count', 38 | 'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified', 39 | 'verified_reason': 'verified_reason', 'verified_type': 'verified_type' 40 | } 41 | for field, attr in field_map.items(): 42 | user_item[field] = user_info.get(attr) 43 | yield user_item 44 | # 关注 45 | uid = user_info.get('id') 46 | yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows, 47 | meta={'page': 1, 'uid': uid}) 48 | # 粉丝 49 | yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans, 50 | meta={'page': 1, 'uid': uid}) 51 | # 微博 52 | yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos, 53 | meta={'page': 1, 'uid': uid}) 54 | 55 | def parse_follows(self, response): 56 | """ 57 | 解析用户关注 58 | :param response: Response对象 59 | """ 60 | result = json.loads(response.text) 61 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get('data').get('cards')[-1].get( 62 | 'card_group'): 63 | # 解析用户 64 | follows = result.get('data').get('cards')[-1].get('card_group') 65 | for follow in follows: 66 | if follow.get('user'): 67 | uid = follow.get('user').get('id') 68 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user) 69 | 70 | uid = response.meta.get('uid') 71 | # 关注列表 72 | user_relation_item = UserRelationItem() 73 | follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')} for follow in 74 | follows] 75 | user_relation_item['id'] = uid 76 | user_relation_item['follows'] = follows 77 | user_relation_item['fans'] = [] 78 | yield user_relation_item 79 | # 下一页关注 80 | page = response.meta.get('page') + 1 81 | yield Request(self.follow_url.format(uid=uid, page=page), 82 | callback=self.parse_follows, meta={'page': page, 'uid': uid}) 83 | 84 | def parse_fans(self, response): 85 | """ 86 | 解析用户粉丝 87 | :param response: Response对象 88 | """ 89 | result = json.loads(response.text) 90 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get('data').get('cards')[-1].get( 91 | 'card_group'): 92 | # 解析用户 93 | fans = result.get('data').get('cards')[-1].get('card_group') 94 | for fan in fans: 95 | if fan.get('user'): 96 | uid = fan.get('user').get('id') 97 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user) 98 | 99 | uid = response.meta.get('uid') 100 | # 粉丝列表 101 | user_relation_item = UserRelationItem() 102 | fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')} for fan in 103 | fans] 104 | user_relation_item['id'] = uid 105 | user_relation_item['fans'] = fans 106 | user_relation_item['follows'] = [] 107 | yield user_relation_item 108 | # 下一页粉丝 109 | page = response.meta.get('page') + 1 110 | yield Request(self.fan_url.format(uid=uid, page=page), 111 | callback=self.parse_fans, meta={'page': page, 'uid': uid}) 112 | 113 | def parse_weibos(self, response): 114 | """ 115 | 解析微博列表 116 | :param response: Response对象 117 | """ 118 | result = json.loads(response.text) 119 | if result.get('ok') and result.get('data').get('cards'): 120 | weibos = result.get('data').get('cards') 121 | for weibo in weibos: 122 | mblog = weibo.get('mblog') 123 | if mblog: 124 | weibo_item = WeiboItem() 125 | field_map = { 126 | 'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count', 127 | 'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics', 128 | 'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text', 129 | 'thumbnail': 'thumbnail_pic', 130 | } 131 | for field, attr in field_map.items(): 132 | weibo_item[field] = mblog.get(attr) 133 | weibo_item['user'] = response.meta.get('uid') 134 | yield weibo_item 135 | # 下一页微博 136 | uid = response.meta.get('uid') 137 | page = response.meta.get('page') + 1 138 | yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos, 139 | meta={'uid': uid, 'page': page}) 140 | --------------------------------------------------------------------------------