├── sina
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── weibo.py
│   ├── items.py
│   ├── settings.py
│   ├── pipelines.py
│   └── middlewares.py
├── .gitignore
├── readme
├── scrapy.cfg
└── README.md

/sina/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.so
*.egg
*.egg-info
--------------------------------------------------------------------------------
/readme:
--------------------------------------------------------------------------------
Sina Weibo crawler

For more details, see: http://cuiqingcai.com/4465.html
--------------------------------------------------------------------------------
/sina/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = sina.settings

[deploy]
#url = http://localhost:6800/
project = sina
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
**Sina Weibo Crawler**

**Requirements:**

mongodb
scrapy

**Operating system:**

ubuntu 16

**Data that can be crawled:**

Weibo post content
Weibo comment content
Weibo user profiles could also be crawled; that part is simply not implemented here.

**Notes:**
When testing locally, the spider runs fine for a while, but the IP eventually gets banned, so it is best to use a proxy
(a minimal middleware sketch is included at the end of this README).
Not sure which proxy provider is reliable? Why not build your own proxy pool?
See: [http://cuiqingcai.com/4596.html](http://cuiqingcai.com/4596.html)

For a more detailed walkthrough of the code, see: http://cuiqingcai.com/4465.html

You are also welcome to add me on QQ (549411552) for discussion and feedback.
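
**Proxy middleware sketch:**

The sketch below shows one way a proxy could be wired into this project. It is only an illustration: `RandomProxyMiddleware` and the `HTTP_PROXIES` setting are hypothetical names that do not exist in this repository. The middleware picks a random proxy from a configured list and attaches it to each request via `request.meta['proxy']`, which Scrapy's downloader honours.

```python
# sina/middlewares.py (sketch) -- hypothetical, not part of this repository.
import random


class RandomProxyMiddleware(object):
    """Attach a randomly chosen proxy to every outgoing request."""

    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # HTTP_PROXIES is an assumed custom setting,
        # e.g. HTTP_PROXIES = ['http://user:pass@1.2.3.4:8888']
        return cls(crawler.settings.getlist('HTTP_PROXIES'))

    def process_request(self, request, spider):
        if self.proxies:
            # Scrapy's downloader uses request.meta['proxy'] when it is set.
            request.meta['proxy'] = random.choice(self.proxies)
```

Enable it in settings.py with something like `DOWNLOADER_MIDDLEWARES = {'sina.middlewares.RandomProxyMiddleware': 543}`.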
--------------------------------------------------------------------------------
/sina/items.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19

from scrapy import Item, Field


class WeiboItem(Item):
    # MongoDB collection this item is written to (used by MongoPipeline)
    table_name = 'weibos'

    _id = Field()
    content = Field()
    crawl_time = Field()
    url = Field()
    post_create_time = Field()


class CommentItem(Item):
    table_name = 'comments'
    _id = Field()
    post_id = Field()
    comment = Field()
    refer = Field()
--------------------------------------------------------------------------------
/sina/settings.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19

BOT_NAME = 'sina'

SPIDER_MODULES = ['sina.spiders']
NEWSPIDER_MODULE = 'sina.spiders'


ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Host': 'm.weibo.cn'
}

ITEM_PIPELINES = {
    'sina.pipelines.MongoPipeline': 301,
}

MONGO_URI = 'mongodb://localhost'

MONGO_DATABASE = 'sina_weibo'
--------------------------------------------------------------------------------
/sina/pipelines.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on _id so re-crawled posts and comments overwrite the
        # stored copy instead of creating duplicates.
        self.db[item.table_name].replace_one({'_id': item['_id']}, dict(item), upsert=True)
        return item
--------------------------------------------------------------------------------
/sina/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class SinaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/sina/spiders/weibo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19


import scrapy
import json
from scrapy import Request
import re
import time
from sina.items import CommentItem, WeiboItem

try:
    from urlparse import parse_qs  # Python 2
except ImportError:
    from urllib.parse import parse_qs  # Python 3


class WeiboSpider(scrapy.Spider):
    name = "weibo"
    allowed_domains = ["m.weibo.cn"]
    # root user id to start crawling from
    first_id = '1713926427'
    init_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={}'
    followers_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&uid={uid}&page={page}'

    def start_requests(self):
        url = self.init_url.format(self.first_id)
        yield scrapy.Request(url=url, callback=self.get_containerid)

    def get_containerid(self, response):
        content = json.loads(response.body)
        if content.get('userInfo'):
            user_info = content.get('userInfo')
            # followers url
            print('user id is %s' % user_info.get('id'))
            yield Request(self.followers_url.format(uid=user_info.get('id'), page=1),
                          callback=self.parse_followers_to_get_more_ids)

        containerid = None
        for data in content.get('tabsInfo').get('tabs'):
            if data.get('tab_type') == 'weibo':
                containerid = data.get('containerid')
                print('weibo request url containerid is %s' % containerid)

        # construct the weibo request url
        if containerid:
            weibo_url = response.url + '&containerid=%s' % containerid
            yield scrapy.Request(url=weibo_url, callback=self.get_weibo_id)
        else:
            print('sorry, did not get containerid')

    def parse_followers_to_get_more_ids(self, response):
        content = json.loads(response.body)
        if content.get('ok'):
            followers = content.get('cards')[0].get('card_group')
            for follower in followers:
                user_id = follower.get('user').get('id')
                yield Request(self.init_url.format(user_id), callback=self.get_containerid)

            params = parse_qs(response.url)
            page = str(int(params.get('page')[0]) + 1) if params.get('page') else '2'

            yield Request(self.followers_url.format(uid=params.get('uid')[0], page=page),
                          callback=self.parse_followers_to_get_more_ids)

    def get_weibo_id(self, response):
        content = json.loads(response.body)
        # get the weibo id; you can also save other data here if you need it
        for data in content.get('cards'):
            if data.get('card_type') == 9:
                single_weibo_id = data.get('mblog').get('id')
                print(single_weibo_id)

                post_create_time = data.get('mblog').get('created_at')
                post_comment_url = 'https://m.weibo.cn/api/comments/show?id=%s&page=1' % single_weibo_id
                yield Request(url=post_comment_url, callback=self.get_comment_content)

                post_content_url = 'https://m.weibo.cn/statuses/extend?id=%s' % single_weibo_id
                yield Request(url=post_content_url, callback=self.get_post_content,
                              meta={"post_create_time": post_create_time})

    def get_post_content(self, response):
        post_id = re.findall(r'(\d+)', response.url)[0]
        post_url = 'https://m.weibo.cn/status/%s' % post_id
        post_create_time = response.meta.get("post_create_time")
        content = json.loads(response.body)
        item = WeiboItem()
        # strip html tags from the full post text
        post_content = re.sub(r'<.*?>', '', content.get('longTextContent'))
        item['_id'] = post_id
        item['content'] = post_content
        item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
        item['url'] = post_url
        item['post_create_time'] = post_create_time

        yield item

    def get_comment_content(self, response):
        content = json.loads(response.body)
        # get comment text
        for data in content.get('data'):
            post_id = re.findall(r'(\d+)', response.url)[0]
            _id = data.get('id')
            # strip html tags and the leading "reply @user:" prefix
            text = re.sub(r'<.*?>', '', data.get('text'))
            text_2 = re.sub(r'.*?@.*?:', '', text)
            reply_text = re.sub(r'.*?@.*?:', '', re.sub(r'<.*?>', '', data.get('reply_text', '')))

            item = CommentItem()
            item['_id'] = _id
            item['comment'] = text_2
            item['refer'] = reply_text
            item['post_id'] = post_id
            yield item

        max_page = content.get('max')
        page_num_pattern = r'(\d+)'
        page_num = re.findall(page_num_pattern, response.url)[1]
        if int(max_page) > 1 and int(max_page) > int(page_num):
            post_id_pattern = r'.*?id=(\d+)&page=.*?'
            post_id = re.findall(post_id_pattern, response.url)[0]
            url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (post_id, str(int(page_num) + 1))
            yield Request(url=url, callback=self.get_comment_content)
--------------------------------------------------------------------------------
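
As a rough way to confirm that the crawl is writing data, the following standalone sketch (not part of the repository; it assumes a local MongoDB and pymongo ≥ 3.7 for `count_documents`) queries the database and collections configured in settings.py and items.py after running `scrapy crawl weibo` from the project root:

```python
# check_db.py -- standalone verification sketch, not part of this repository.
import pymongo

# MONGO_URI and MONGO_DATABASE as configured in sina/settings.py
client = pymongo.MongoClient('mongodb://localhost')
db = client['sina_weibo']

# 'weibos' and 'comments' are the table_name values declared in sina/items.py
print('weibos stored:   %d' % db['weibos'].count_documents({}))
print('comments stored: %d' % db['comments'].count_documents({}))

# inspect one stored post
print(db['weibos'].find_one())
```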