├── sina
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── weibo.py
│   ├── items.py
│   ├── settings.py
│   ├── pipelines.py
│   └── middlewares.py
├── .gitignore
├── readme
├── scrapy.cfg
└── README.md

/sina/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.so
*.egg
*.egg-info
--------------------------------------------------------------------------------
/readme:
--------------------------------------------------------------------------------
Sina Weibo crawler

For more details, see: http://cuiqingcai.com/4465.html
--------------------------------------------------------------------------------
/sina/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = sina.settings

[deploy]
#url = http://localhost:6800/
project = sina
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
**Sina Weibo Crawler**

**Requirements:**

mongodb
scrapy

**Operating system:**

ubuntu 16

**Data that can be crawled:**

Weibo post content
Weibo comment content
Weibo user profiles could also be crawled; that part is simply not implemented here.

**Notes:**
When testing locally, the spider runs fine for a while, but the IP eventually gets banned, so it is best to use a proxy
(a minimal middleware sketch is included at the end of this README).
Not sure which proxy provider is reliable? Why not build your own proxy pool?
See: [http://cuiqingcai.com/4596.html](http://cuiqingcai.com/4596.html)

For a more detailed walkthrough of the code, see: http://cuiqingcai.com/4465.html

You are also welcome to add me on QQ (549411552) for discussion and feedback.
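
**Proxy middleware sketch:**

The sketch below shows one way a proxy could be wired into this project. It is only an illustration: `RandomProxyMiddleware` and the `HTTP_PROXIES` setting are hypothetical names that do not exist in this repository. The middleware picks a random proxy from a configured list and attaches it to each request via `request.meta['proxy']`, which Scrapy's downloader honours.

```python
# sina/middlewares.py (sketch) -- hypothetical, not part of this repository.
import random


class RandomProxyMiddleware(object):
    """Attach a randomly chosen proxy to every outgoing request."""

    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # HTTP_PROXIES is an assumed custom setting,
        # e.g. HTTP_PROXIES = ['http://user:pass@1.2.3.4:8888']
        return cls(crawler.settings.getlist('HTTP_PROXIES'))

    def process_request(self, request, spider):
        if self.proxies:
            # Scrapy's downloader uses request.meta['proxy'] when it is set.
            request.meta['proxy'] = random.choice(self.proxies)
```

Enable it in settings.py with something like `DOWNLOADER_MIDDLEWARES = {'sina.middlewares.RandomProxyMiddleware': 543}`.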
--------------------------------------------------------------------------------
/sina/items.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19

from scrapy import Item, Field


class WeiboItem(Item):
    # MongoDB collection this item is written to (used by MongoPipeline)
    table_name = 'weibos'

    _id = Field()
    content = Field()
    crawl_time = Field()
    url = Field()
    post_create_time = Field()


class CommentItem(Item):
    table_name = 'comments'
    _id = Field()
    post_id = Field()
    comment = Field()
    refer = Field()
--------------------------------------------------------------------------------
/sina/settings.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19

BOT_NAME = 'sina'

SPIDER_MODULES = ['sina.spiders']
NEWSPIDER_MODULE = 'sina.spiders'


ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Host': 'm.weibo.cn'
}

ITEM_PIPELINES = {
    'sina.pipelines.MongoPipeline': 301,
}

MONGO_URI = 'mongodb://localhost'

MONGO_DATABASE = 'sina_weibo'
--------------------------------------------------------------------------------
/sina/pipelines.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on _id so re-crawled posts and comments overwrite the
        # stored copy instead of creating duplicates.
        self.db[item.table_name].replace_one({'_id': item['_id']}, dict(item), upsert=True)
        return item
--------------------------------------------------------------------------------
/sina/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class SinaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/sina/spiders/weibo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-5-19


import scrapy
import json
from scrapy import Request
import re
import time
from sina.items import CommentItem, WeiboItem

try:
    from urlparse import parse_qs  # Python 2
except ImportError:
    from urllib.parse import parse_qs  # Python 3


class WeiboSpider(scrapy.Spider):
    name = "weibo"
    allowed_domains = ["m.weibo.cn"]
    # root user id to start crawling from
    first_id = '1713926427'
    init_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={}'
    followers_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&uid={uid}&page={page}'

    def start_requests(self):
        url = self.init_url.format(self.first_id)
        yield scrapy.Request(url=url, callback=self.get_containerid)

    def get_containerid(self, response):
        content = json.loads(response.body)
        if content.get('userInfo'):
            user_info = content.get('userInfo')
            # followers url
            print('user id is %s' % user_info.get('id'))
            yield Request(self.followers_url.format(uid=user_info.get('id'), page=1),
                          callback=self.parse_followers_to_get_more_ids)

        containerid = None
        for data in content.get('tabsInfo').get('tabs'):
            if data.get('tab_type') == 'weibo':
                containerid = data.get('containerid')
                print('weibo request url containerid is %s' % containerid)

        # construct the weibo request url
        if containerid:
            weibo_url = response.url + '&containerid=%s' % containerid
            yield scrapy.Request(url=weibo_url, callback=self.get_weibo_id)
        else:
            print('sorry, did not get containerid')

    def parse_followers_to_get_more_ids(self, response):
        content = json.loads(response.body)
        if content.get('ok'):
            followers = content.get('cards')[0].get('card_group')
            for follower in followers:
                user_id = follower.get('user').get('id')
                yield Request(self.init_url.format(user_id), callback=self.get_containerid)

            params = parse_qs(response.url)
            page = str(int(params.get('page')[0]) + 1) if params.get('page') else '2'

            yield Request(self.followers_url.format(uid=params.get('uid')[0], page=page),
                          callback=self.parse_followers_to_get_more_ids)

    def get_weibo_id(self, response):
        content = json.loads(response.body)
        # get the weibo id; you can also save other data here if you need it
        for data in content.get('cards'):
            if data.get('card_type') == 9:
                single_weibo_id = data.get('mblog').get('id')
                print(single_weibo_id)

                post_create_time = data.get('mblog').get('created_at')
                post_comment_url = 'https://m.weibo.cn/api/comments/show?id=%s&page=1' % single_weibo_id
                yield Request(url=post_comment_url, callback=self.get_comment_content)

                post_content_url = 'https://m.weibo.cn/statuses/extend?id=%s' % single_weibo_id
                yield Request(url=post_content_url, callback=self.get_post_content,
                              meta={"post_create_time": post_create_time})

    def get_post_content(self, response):
        post_id = re.findall(r'(\d+)', response.url)[0]
        post_url = 'https://m.weibo.cn/status/%s' % post_id
        post_create_time = response.meta.get("post_create_time")
        content = json.loads(response.body)
        item = WeiboItem()
        # strip html tags from the full post text
        post_content = re.sub(r'<.*?>', '', content.get('longTextContent'))
        item['_id'] = post_id
        item['content'] = post_content
        item['crawl_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
        item['url'] = post_url
        item['post_create_time'] = post_create_time

        yield item

    def get_comment_content(self, response):
        content = json.loads(response.body)
        # get comment text
        for data in content.get('data'):
            post_id = re.findall(r'(\d+)', response.url)[0]
            _id = data.get('id')
            # strip html tags and the leading "reply @user:" prefix
            text = re.sub(r'<.*?>', '', data.get('text'))
            text_2 = re.sub(r'.*?@.*?:', '', text)
            reply_text = re.sub(r'.*?@.*?:', '', re.sub(r'<.*?>', '', data.get('reply_text', '')))

            item = CommentItem()
            item['_id'] = _id
            item['comment'] = text_2
            item['refer'] = reply_text
            item['post_id'] = post_id
            yield item

        max_page = content.get('max')
        page_num_pattern = r'(\d+)'
        page_num = re.findall(page_num_pattern, response.url)[1]
        if int(max_page) > 1 and int(max_page) > int(page_num):
            post_id_pattern = r'.*?id=(\d+)&page=.*?'
            post_id = re.findall(post_id_pattern, response.url)[0]
            url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (post_id, str(int(page_num) + 1))
            yield Request(url=url, callback=self.get_comment_content)
--------------------------------------------------------------------------------
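
As a rough way to confirm that the crawl is writing data, the following standalone sketch (not part of the repository; it assumes a local MongoDB and pymongo ≥ 3.7 for `count_documents`) queries the database and collections configured in settings.py and items.py after running `scrapy crawl weibo` from the project root:

```python
# check_db.py -- standalone verification sketch, not part of this repository.
import pymongo

# MONGO_URI and MONGO_DATABASE as configured in sina/settings.py
client = pymongo.MongoClient('mongodb://localhost')
db = client['sina_weibo']

# 'weibos' and 'comments' are the table_name values declared in sina/items.py
print('weibos stored:   %d' % db['weibos'].count_documents({}))
print('comments stored: %d' % db['comments'].count_documents({}))

# inspect one stored post
print(db['weibos'].find_one())
```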