├── OopSpider
│   └── oop
│       ├── oop
│       │   ├── __init__.py
│       │   ├── spiders
│       │   │   ├── __init__.py
│       │   │   └── oop_spider.py
│       │   ├── pipelines.py
│       │   ├── items.py
│       │   ├── middlewares.py
│       │   └── settings.py
│       └── scrapy.cfg
├── ZhenAiSpider
│   └── ZhenAi
│       ├── ZhenAi
│       │   ├── __init__.py
│       │   ├── spiders
│       │   │   ├── __init__.py
│       │   │   ├── utils.py
│       │   │   ├── mymongo.py
│       │   │   └── zhenai_spider.py
│       │   ├── items.py
│       │   ├── pipelines.py
│       │   ├── middlewares.py
│       │   └── settings.py
│       └── scrapy.cfg
├── TMallCommentSpider
│   ├── TMallCommentSpider
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   └── tmall_comment_spider.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   └── settings.py
│   ├── README.md
│   ├── scrapy.cfg
│   ├── tmall_comment_selenium_spider.py
│   └── tmall_comment_spider.py
├── BaiduZhidaoCommentSpider
│   ├── README.md
│   └── baidu_zhidao_comment_spider.py
├── README.md
├── MouserSpider
│   └── myselenium.py
├── WeiboCommentSpider
│   ├── README.md
│   ├── weibocomment.py
│   ├── weibo_comment_result_example.json
│   └── weibo_search_result_example.json
├── JDCommentSpider
│   ├── jdcomment.py
│   └── README.md
├── LICENSE
└── ZhihuSpider
    └── zhihu_spider.py
--------------------------------------------------------------------------------
/OopSpider/oop/oop/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # TMallSpider
2 | This project is not usable for now; Tmall's anti-scraping measures have not been defeated yet
--------------------------------------------------------------------------------
/OopSpider/oop/oop/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/BaiduZhidaoCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # BaiduZhidaoCommentSpider
2 | Searches for a keyword, walks through the search results, and scrapes the comment pages
3 | 
4 | ## Limitations
5 | 1. Could be reimplemented as a scrapy `CrawlSpider`, but this was not done (a sketch is appended at the end of `baidu_zhidao_comment_spider.py` below)
6 | 2. 
Baidu's anti-scraping trick: some characters in a comment are replaced with images; this could be handled with OCR, but it has not been done yet
7 | 
8 | ## api
9 | Plain `html` pages only; there is no `json` or other `REST`-style interface
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/spiders/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import urllib
5 | 
6 | 
7 | search_keyword = '美年健康'
8 | 
9 | 
10 | if __name__ == '__main__':
11 |     print urllib.quote(search_keyword.decode('utf-8').encode('gbk'))
--------------------------------------------------------------------------------
/OopSpider/oop/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = oop.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = oop
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = ZhenAi.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ZhenAi
--------------------------------------------------------------------------------
/TMallCommentSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = TMallCommentSpider.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = TMallCommentSpider
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | 
9 | class TmallcommentspiderPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class TmallcommentspiderItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
--------------------------------------------------------------------------------
/TMallCommentSpider/tmall_comment_selenium_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | from selenium import webdriver
5 | 
6 | browser = webdriver.Firefox()
7 | browser.get('https://list.tmall.com/search_product.htm?q=%C3%C0%C4%EA%BD%A1%BF%B5&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton')
8 | html_source = browser.page_source
9 | print html_source
10 | browser.close()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class ZhenaiItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pic_url = scrapy.Field()  # used for pipeline to download
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spider
2 | Some spiders for various websites
3 | 
4 | ## JDCommentSpider
5 | A focused crawler for JD.com product comments
6 | 
7 | Tags: `Python`, `requests`
8 | 
9 | ## MouserSpider
10 | A crawler for the Mouser website
11 | 
12 | Tags: `Python`, `requests`, `selenium`
13 | 
14 | ## OopSpider
15 | A crawler for the 面向对象 dating site (date.jobbole.com)
16 | 
17 | Tags: `Python`, `scrapy`
18 | 
19 | ## ZhenAiSpider
20 | A crawler for the Zhenai (珍爱网) dating site
21 | 
22 | Tags: `Python`, `scrapy`
23 | 
24 | ## WeiboCommentSpider
25 | A crawler for the comments on Sina Weibo search results
26 | 
27 | Tags: `Python`, `requests`
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.pipelines.images import ImagesPipeline
8 | from scrapy.http import Request
9 | 
10 | 
11 | class ZhenAiImagePipline(ImagesPipeline):
12 |     def get_media_requests(self, item, info):
13 |         for url in item['pic_url']:
14 |             yield Request(url)
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import re
4 | from itertools import product
5 | 
6 | 
7 | def get_brief_td_to_key_value(td):
8 |     # split the text of one brief-table <td> cell into a (key, value) pair;
9 |     # the fullwidth colon (u'\uff1a') is stripped from the key
10 |     p = '(.*?)[::]?\s?(.*?)'
11 |     result = re.findall(p, td)
12 |     if len(result) == 1:
13 |         return result[0][0].replace(u'\uff1a', ''), result[0][1]
14 |     else:
15 |         return None, None
16 | 
17 | 
18 | def get_info_td_to_key_value(td):
19 |     # same idea for the info-floor tables; here the key and the value are
20 |     # two separate matches of the pattern
21 |     p = '(.*?)[::]?\s?'
22 |     result = re.findall(p, td)
23 |     if len(result) == 2:
24 |         return result[0].replace(u'\uff1a', ''), result[1]
25 |     else:
26 |         return None, None
27 | 
28 | 
29 | def url_generator(url, *args):
30 |     # yield the url template formatted with every combination (Cartesian
31 |     # product) of the given argument lists
32 |     for items in product(*args):
33 |         yield url.format(*items)
--------------------------------------------------------------------------------
/OopSpider/oop/oop/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.contrib.pipeline.images import ImagesPipeline
8 | from scrapy.http import Request
9 | 
10 | 
11 | class OopImagesPipeline(ImagesPipeline):
12 | 
13 |     def get_media_requests(self, item, info):
14 |         for image_url in item['pic_url']:
15 |             yield Request(image_url)
16 | 
17 |     def item_completed(self, results, item, info):
18 |         # results - [(success, image_info_or_failure)] image_info - {url: x, path: x, checksum: x}
19 |         pic_paths = []
20 |         for success, image_info_or_failure in results:
21 |             if success:
22 |                 pic_paths.append(image_info_or_failure['path'])
23 |             else:
24 |                 pic_paths.append([])
25 |         item['pic_path'] = pic_paths
26 |         return item
--------------------------------------------------------------------------------
/OopSpider/oop/oop/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class OopItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     url = scrapy.Field()
15 |     title = scrapy.Field()
16 |     date = scrapy.Field()
17 |     location = scrapy.Field()
18 |     birth = scrapy.Field()
19 |     tall = scrapy.Field()
20 |     work_city = scrapy.Field()
21 |     born_city = scrapy.Field()
22 |     work = scrapy.Field()
23 |     parent = scrapy.Field()
24 |     only_child = scrapy.Field()
25 |     rich = scrapy.Field()
26 |     interest = scrapy.Field()
27 |     distance_love = scrapy.Field()
28 |     year_married = scrapy.Field()
29 |     num_child = scrapy.Field()
30 |     lowest_command = scrapy.Field()
31 |     special_command = scrapy.Field()
32 |     introduction = scrapy.Field()
33 |     pic_url = scrapy.Field()
34 |     pic_path = scrapy.Field()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/mymongo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import pymongo
4 | import traceback
5 | 
6 | 
7 | class MyMongo(object):
8 |     def __init__(self, host='127.0.0.1', port=27017):
9 |         self.client = None
10 |         self.db = None
11 |         try:
12 |             self.client = pymongo.MongoClient(host=host, port=port)
13 |         except:
14 |             traceback.print_exc()
15 |             self.client = None
16 | 
17 |     def get_db(self, db_name):
18 |         if self.client:
19 |             try:
20 |                 self.db = self.client[db_name]
21 |             except:
22 |                 traceback.print_exc()
23 | 
24 |     def insert_doc(self, db_name, doc_name, doc_list):
25 |         # doc_list - [doc1, doc2, ..., docN]
26 |         self.get_db(db_name)  # get database to self.db
27 |         if self.db is None:
28 |             return False
29 |         try:
30 |             doc = self.db[doc_name]
31 |             doc.insert(doc_list)
32 |         except:
33 |             traceback.print_exc()
34 |             return False
35 |         return True
36 | 
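37 | 
38 | if __name__ == '__main__':
39 |     # Minimal usage sketch (assumes a MongoDB server on the default
40 |     # localhost:27017; the 'TestData' collection name is made up for
41 |     # illustration -- the spider itself writes to 'SimpleData' and
42 |     # 'CompleteData').
43 |     mongo = MyMongo()
44 |     ok = mongo.insert_doc('ZhenAi', 'TestData', [{'memberId': 0}])
45 |     print 'insert ok:', ok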
--------------------------------------------------------------------------------
/MouserSpider/myselenium.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from selenium import webdriver
4 | import requests
5 | import sqlite3
6 | 
7 | browser = webdriver.Firefox()
8 | browser.get('http://www.mouser.cn')
9 | html_source = browser.page_source
10 | print html_source
11 | 
12 | # read the session cookies straight out of the Firefox profile's sqlite store
13 | conn = sqlite3.connect('/root/.mozilla/firefox/gmfs2ivm.default/cookies.sqlite')
14 | cursor = conn.cursor()
15 | cursor.execute('select name, value from moz_cookies where baseDomain="mouser.cn"')
16 | cookies = cursor.fetchall()
17 | conn.close()
18 | 
19 | 
20 | cookie = [item[0] + "=" + item[1] for item in cookies]
21 | 
22 | cookiestr = ';'.join(cookie)
23 | 
24 | print cookiestr
25 | 
26 | myheaders = {
27 |     'Host': 'www.mouser.cn',
28 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
29 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
30 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
31 |     'Accept-Encoding': 'gzip, deflate',
32 |     'Upgrade-Insecure-Requests': '1',
33 |     'If-None-Match': "76b9f323a7b0ec42447e8435c1bc98bd",
34 |     'Cache-Control': 'max-age=0',
35 |     'Cookie': cookiestr
36 | }
37 | 
38 | s = requests.session()
39 | r = s.get('http://www.mouser.cn/Semiconductors/RF-Semiconductors/_/N-96p9c/', headers=myheaders)
40 | 
41 | data = r.content
42 | 
43 | f = open('data.html', 'w')
44 | f.write(data)
45 | f.close()
46 | 
47 | browser.close()
--------------------------------------------------------------------------------
/OopSpider/oop/oop/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class OopSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class ZhenaiSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class TmallcommentspiderSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /OopSpider/oop/oop/spiders/oop_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from oop.items import OopItem 5 | from scrapy.contrib.spiders import CrawlSpider 6 | from scrapy.contrib.spiders import Rule 7 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 8 | from scrapy.utils.response import get_base_url 9 | 10 | 11 | class OopSpider(CrawlSpider): 12 | name = 'OopSpider' 13 | allowed_domains = ['date.jobbole.com'] 14 | start_urls = ['http://date.jobbole.com'] 15 | rules = [ 16 | Rule(SgmlLinkExtractor(allow=('/page/\d{1,3}/')), follow=True, callback='parse_item') 17 | ] 18 | index_flag = True 19 | 20 | def parse_item(self, response): 21 | # add front page search 22 | if OopSpider.index_flag: 23 | OopSpider.index_flag = False 24 | yield scrapy.Request(OopSpider.start_urls[0], callback=self.parse_item) 25 | 26 | html_selector = scrapy.Selector(response) 27 | urls = html_selector.xpath('//li[@class="media"]/div/h3/a/@href').extract() 28 | for url in urls: 29 | yield scrapy.Request(url, callback=self.parse_detail_item) 30 | 31 | def parse_detail_item(self, response): 32 | items = OopItem() 33 | html_selector = scrapy.Selector(response) 34 | 35 | items['url'] = get_base_url(response) 36 | 37 | head_prefix = '//div[@class="p-single"]' 38 | items['title'] = html_selector.xpath(head_prefix + '//h1/text()').extract() 39 | items['date'] = html_selector.xpath(head_prefix + '//p[@class="p-meta"]/span[1]/text()').extract() 40 | items['location'] = html_selector.xpath(head_prefix + '//p[@class="p-meta"]/span[2]/a/text()').extract() 41 | 42 | detail_prefix = '//div[@class="p-entry"]' 43 | details = html_selector.xpath(detail_prefix + '/p/text()').extract() 44 | details = map(lambda x: x.replace('\n', ''), details) 45 | items['birth'] = details[0] 46 | items['tall'] = details[1] 47 | items['work_city'] = details[2] 48 | items['born_city'] = details[3] 49 | items['work'] = details[4] 50 | items['parent'] = details[5] 51 | items['only_child'] = details[6] 52 | items['rich'] = 
details[7]
53 |         items['interest'] = details[8]
54 |         items['distance_love'] = details[9]
55 |         items['year_married'] = details[10]
56 |         items['num_child'] = details[11]
57 |         items['lowest_command'] = details[12]
58 |         items['special_command'] = details[13]
59 |         items['introduction'] = details[14]
60 |         items['pic_url'] = html_selector.xpath(detail_prefix + '/p/img/@src').extract()
61 |         return items
--------------------------------------------------------------------------------
/WeiboCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # WeiboCommentSpider
2 | A crawler for Weibo comments
3 | 
4 | ## How it works
5 | Log in through the Weibo login API to obtain `cookies`, then search and scrape comments at will on `m.weibo.cn`
6 | 
7 | ## Limitations
8 | 1. Only the latest 1000 Weibo posts in the search results can be fetched
9 | 2. Serial (non-concurrent) version
10 | 3. Output is encoded as `UTF-8 without BOM`, so the resulting csv file cannot be opened directly in `Microsoft Office Excel`; converting it to `UTF-8 with BOM` with `Notepad++` is recommended
11 | 
12 | ## Login API
13 | ### Implementation
14 | ```python
15 | def login(self):
16 |     user = self.get_b64_username()  # the username is urlencoded and then base64-encoded
17 |     passwd = self.get_rsa_password(self.pubkey, self.nonce, self.servertime)
18 |     self.login_params['su'] = user
19 |     self.login_params['servertime'] = self.servertime
20 |     self.login_params['nonce'] = self.nonce
21 |     self.login_params['rsakv'] = self.rsakv
22 |     self.login_params['sp'] = passwd
23 | 
24 |     resp = self.session.post(self.login_url, data=self.login_params, headers=self.login_headers)
25 |     if 'retcode%3D0' in resp.content:
26 |         print 'login success'
27 |         return True
28 |     print 'login fail'
29 |     print resp.content
30 |     return False
31 | ```
32 | ### How it works
33 | First fetch the parameters needed for encryption (`pubkey`, `servertime`, `nonce`, `rsakv`) from `https://login.sina.com.cn/sso/prelogin.php`,
34 | then encrypt the password with rsa2 (the js encryption code is shown below), and finally post it to the server's login endpoint
35 | ```javascript
36 | var RSAKey = new sinaSSOEncoder.RSAKey();
37 | RSAKey.setPublic(me.rsaPubkey, "10001");
38 | password = RSAKey.encrypt([me.servertime, me.nonce].join("\t") + "\n" + password)
39 | ```
40 | 
41 | ## About the Weibo and comment APIs
42 | `m.weibo.cn` is a magical entry point: it logs in with a plaintext `password` and its endpoints return `json`,
43 | but it only exposes the `1000` most recent posts sorted by time. `weibo.cn`, on the other hand, uses an `rsa2`-encrypted password, loads data through suffocating dynamic `ajax` calls,
44 | and returns `html`/`js` text. To save time, the `m.weibo.cn` entry point is used for now; it can be analysed further if the need arises
45 | 
46 | ## Weibo API
47 | ### headers
48 | ```python
49 | self.search_headers = {
50 |     'Host': 'm.weibo.cn',
51 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
52 |     'Accept': 'application/json, text/plain, */*',
53 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
54 |     'Accept-Encoding': 'gzip, deflate, br',
55 |     'X-Requested-With': 'XMLHttpRequest',
56 |     'Referer': '',
57 | }
58 | self.search_referer = 'https://m.weibo.cn/p/100103type%3D2%26q%3D{keyword}?type=wb&queryVal={keyword}' \
59 |                       '&featurecode=20000320&luicode=10000011&lfid=106003type%3D1&title={keyword}'
60 | ```
61 | ### params
62 | ```python
63 | self.search_data = {
64 |     'type': 'wb',
65 |     'queryVal': '{keyword}',
66 |     'featurecode': '20000320',
67 |     'luicode': '10000011',
68 |     'lfid': '106003type=1',
69 |     'title': '{keyword}',
70 |     'containerid': '100103type=2&q={keyword}',
71 | }
72 | ```
73 | ### response
74 | See [weibo_search_result_example.json](weibo_search_result_example.json)
75 | 
76 | ## Comment API
77 | ### headers
78 | The `Referer` is validated
79 | ```python
80 | self.comment_headers = {
81 |     'Host': 'm.weibo.cn',
82 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
83 |     'Accept': 'application/json, text/plain, */*',
84 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
85 |     'Accept-Encoding': 'gzip, deflate, br',
86 | 
    'X-Requested-With': 'XMLHttpRequest',
87 |     # 'Referer': 'https://m.weibo.cn/status/4172328575544621',
88 |     'Connection': 'keep-alive',
89 | }
90 | ```
91 | ### params
92 | ```python
93 | self.comment_data = {
94 |     'id': '{id}',  # Weibo post id
95 |     'page': '{page}',  # page number
96 | }
97 | ```
98 | ### response
99 | See [weibo_comment_result_example.json](weibo_comment_result_example.json)
--------------------------------------------------------------------------------
/OopSpider/oop/oop/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for oop project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'oop'
13 | 
14 | SPIDER_MODULES = ['oop.spiders']
15 | NEWSPIDER_MODULE = 'oop.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'oop (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'oop.middlewares.OopSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'oop.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |     'oop.pipelines.OopImagesPipeline': 300,
69 | }
70 | 
71 | # ImagePipeline Setting
72 | IMAGES_STORE = './oop/pic'
73 | IMAGES_EXPIRES = 90
74 | 
75 | # Enable and configure the AutoThrottle extension (disabled by default)
76 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
77 | #AUTOTHROTTLE_ENABLED = True
78 | # The initial download delay
79 | #AUTOTHROTTLE_START_DELAY = 5
80 | # The maximum download delay to be set in case of high latencies
81 | #AUTOTHROTTLE_MAX_DELAY = 60
82 | # The average number of requests Scrapy 
should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | -------------------------------------------------------------------------------- /TMallCommentSpider/TMallCommentSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for TMallCommentSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'TMallCommentSpider' 13 | 14 | SPIDER_MODULES = ['TMallCommentSpider.spiders'] 15 | NEWSPIDER_MODULE = 'TMallCommentSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'TMallCommentSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | DEFAULT_REQUEST_HEADERS = { 47 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | 'Accept-Language': 'en-US,en;q=0.5', 50 | 'Accept-Encoding': 'gzip, deflate, br', 51 | 'Connection': 'keep-alive', 52 | 'Upgrade-Insecure-Requests': '1', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'TMallCommentSpider.middlewares.TmallcommentspiderSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'TMallCommentSpider.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | 
#EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | #ITEM_PIPELINES = { 76 | # 'TMallCommentSpider.pipelines.TmallcommentspiderPipeline': 300, 77 | #} 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | #AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | #AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | #AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of requests Scrapy should be sending in parallel to 87 | # each remote server 88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | #AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | #HTTPCACHE_ENABLED = True 95 | #HTTPCACHE_EXPIRATION_SECS = 0 96 | #HTTPCACHE_DIR = 'httpcache' 97 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /ZhenAiSpider/ZhenAi/ZhenAi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ZhenAi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ZhenAi' 13 | 14 | SPIDER_MODULES = ['ZhenAi.spiders'] 15 | NEWSPIDER_MODULE = 'ZhenAi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ZhenAi (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', 44 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 46 | 'Accept-Encoding': 'gzip, deflate', 47 | 'X-Requested-With': 'XMLHttpRequest', 48 | 'Referer': 'http://search.zhenai.com/v2/search/pinterest.do?' 
49 | 'sex=1&agebegin=18&ageend=-1&workcityprovince=-1&workcitycity=-1' 50 | '&info=&h1=-1&h2=-1&salaryBegin=-1&salaryEnd=-1&occupation=-1&h=-1' 51 | '&c=-1&workcityprovince1=-1&workcitycity1=-1&constellation=-1&animals=-1' 52 | '&stock=-1&belief=-1&lvBegin=-1&lvEnd=-1&condition=66&orderby=hpf&hotIndex=&online=', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'ZhenAi.middlewares.ZhenaiSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'ZhenAi.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'ZhenAi.pipelines.ZhenAiImagePipline': 300, 77 | } 78 | IMAGES_STORE = 'pic' 79 | IMAGES_EXPIRES = 90 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /TMallCommentSpider/TMallCommentSpider/spiders/tmall_comment_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import scrapy 5 | import config 6 | import urllib 7 | import re 8 | import json 9 | import time 10 | import random 11 | import requests 12 | import codecs 13 | import datetime 14 | 15 | 16 | class TmallCommentSpider(scrapy.Spider): 17 | name = 'tmall_comment_spider' 18 | keyword = urllib.quote(config.search_keyword.decode('utf-8').encode('gbk')) 19 | search_url = 'https://list.tmall.com/search_product.htm?q={keyword}&type=p&vmarket=' \ 20 | '&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'.format(keyword=keyword) 21 | item_url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.50092370Go9Qa5' \ 22 | '&s={start_index}&q={keyword}&sort=s&style=g&from=.list.pc_1_searchbutton' \ 23 | '&type=pc#J_Filter' 24 | comment_url = 'https://rate.tmall.com/list_detail_rate.htm' 25 | 26 | my_headers = { 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
29 |         'Accept-Language': 'en-US,en;q=0.5',
30 |         'Accept-Encoding': 'gzip, deflate, br',
31 |     }
32 | 
33 |     start_urls = [search_url]
34 | 
35 |     filename = 'tmall-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
36 | 
37 |     def parse(self, response):
38 |         total_page = response.xpath('//input[@name="totalPage"]/@value').extract_first()
39 |         if total_page:
40 |             total_page = int(total_page)
41 |             for page in range(total_page):
42 |                 start_index = page * 60
43 |                 page_url = self.item_url.format(start_index=start_index, keyword=self.keyword)
44 |                 yield scrapy.Request(url=page_url, method='GET', callback=self.parse_search_result,
45 |                                      headers=self.my_headers)
46 | 
47 |     def parse_search_result(self, response):
48 |         item_urls = response.xpath('//p[@class="productStatus"]//a/@href').extract()
49 |         comment_nums = response.xpath('//p[@class="productStatus"]//a/text()').extract()
50 |         for i in range(len(item_urls)):
51 |             if int(comment_nums[i]):
52 |                 yield scrapy.Request(url='https:' + item_urls[i], method='GET', callback=self.parse_item,
53 |                                      headers=self.my_headers)
54 | 
55 |     def parse_item(self, response):
56 |         resp_text = response.text.replace('\n', '')
57 |         pattern = 'TShop\.Setup\((.*?)\);'
58 |         result = re.findall(pattern, resp_text)
59 | 
60 |         data = {
61 |             'itemId': '',  # required
62 |             'spuId': '',  # required
63 |             'sellerId': '',  # required
64 |             'order': '3',
65 |             'currentPage': '1',  # page number
66 |             'append': '0',
67 |             'content': '1',
68 |             'tagId': '',
69 |             'posi': '',
70 |             'picture': '',
71 |             'ua': '',
72 |             'needFold': '0',
73 |             '_ksTS': '',  # millisecond timestamp + '_' + 4-digit random number
74 |             'callback': '',  # 'jsonp_' + (4-digit random number + 1)
75 |         }
76 | 
77 |         if result:
78 |             json_data = json.loads(result[0])
79 |             item_do = json_data['itemDO']
80 | 
81 |             data['itemId'] = item_do['itemId']
82 |             data['spuId'] = item_do['spuId']
83 |             data['sellerId'] = item_do['userId']
84 |             random_int = random.randint(1000, 9998)
85 |             data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
86 |             data['callback'] = 'jsonp_%d' % (random_int + 1)
87 | 
88 |             my_headers = self.my_headers.copy()
89 |             my_headers['Host'] = 'rate.tmall.com'
90 |             my_headers['Referer'] = response.url
91 | 
92 |             resp = requests.get(self.comment_url, params=data, headers=my_headers)
93 |             json_str = resp.content[len(data['callback']) + 1:-1]  # strip the jsonp wrapper: callback( ... )
94 |             json_data = json.loads(json_str)
95 |             max_pages = json_data['rateDetail']['paginator']['lastPage']
96 |             for i in range(max_pages):
97 |                 random_int = random.randint(1000, 9998)
98 |                 data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
99 |                 data['callback'] = 'jsonp_%d' % (random_int + 1)
100 |                 data['currentPage'] = str(i + 1)
101 | 
102 |                 url = self.comment_url + '?' + urllib.urlencode(data)
103 |                 yield scrapy.Request(url=url, headers=my_headers, callback=self.parse_comment)
104 | 
105 |     def parse_comment(self, response):
106 |         json_data = json.loads(response.text[len('jsonp_9999') + 1:-1])  # the callback name is always 10 characters long
107 |         rate_list = json_data['rateDetail']['rateList']
108 |         for rate in rate_list:
109 |             user_nickname = rate['displayUserNick']
110 |             user_id = rate['id']
111 |             rate_content = rate['rateContent']
112 |             rate_date = rate['rateDate']
113 | 
114 |             with codecs.open(self.filename, 'a') as f:
115 |                 f.write('|'.join((str(user_id), user_nickname, rate_content, rate_date)) + '\n')
--------------------------------------------------------------------------------
/BaiduZhidaoCommentSpider/baidu_zhidao_comment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import requests
5 | import urllib
6 | from lxml import etree
7 | import re
8 | import datetime
9 | import codecs
10 | import time
11 | import random
12 | 
13 | SLEEP = [0.5, 1, 1.5, 2, 2.5, 3]
14 | 
15 | 
16 | class BaiduZhidao():
17 |     search_url = 'https://zhidao.baidu.com/search?word={keyword}&ie=gbk&site=-1&sites=0&date=0&pn=PAGE'
18 |     my_headers = {
19 |         'Host': 'zhidao.baidu.com',
20 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
21 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
22 |         'Accept-Language': 'en-US,en;q=0.5',
23 |         'Accept-Encoding': 'gzip, deflate, br',
24 |     }
25 |     comment_url = 'https://zhidao.baidu.com/question/{question_id}.html?sort=9&rn=5&pn=PAGE#wgt-answers'
26 | 
27 |     def __init__(self, keyword):
28 |         self.session = requests.Session()
29 |         self.keyword = keyword
30 |         self.search_url = self.search_url.format(keyword=urllib.quote(keyword.decode('utf-8').encode('gbk')))
31 |         self.question_ids = []
32 |         self.filename = 'baidu_zhidao-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
33 | 
34 |     def set_keyword(self, keyword):
35 |         self.keyword = keyword
36 | 
37 |     def reset_filename(self):
38 |         self.filename = 'baidu_zhidao-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
39 | 
40 |     @staticmethod
41 |     def extract_question_id(url):
42 |         pattern = '/question/(\d+?)\.'
43 |         result = re.findall(pattern, url)
44 |         if result:
45 |             return result[0]
46 |         else:
47 |             return None
48 | 
49 |     @staticmethod
50 |     def html_filter(html_text):
51 |         html_text = html_text.replace('\n', '').replace('\t', ' ')
52 |         pattern = re.compile(r'<[^>]+>', re.S)
53 |         no_html_text = pattern.sub('', html_text)
54 |         return no_html_text
55 | 
56 |     def search(self, page=0):
57 |         print '-*- start search with page %d -*-' % (page / 10 + 1)
58 |         time.sleep(SLEEP[random.randint(0, len(SLEEP) - 1)])
59 |         resp = self.session.get(url=self.search_url.replace('PAGE', str(page)), headers=self.my_headers)
60 |         if resp.status_code == 200:
61 |             response = etree.HTML(resp.text)
62 |             urls = response.xpath('//a[@class="ti"]/@href')
63 |             self.question_ids.extend(filter(None, map(self.extract_question_id, urls)))
64 | 
65 |             next_page = response.xpath('//a[@class="pager-next"]/@href')
66 |             if next_page:
67 |                 next_page_number = re.findall('&pn=(\d+)$', next_page[0])
68 |                 if next_page_number:
69 |                     next_page_number = int(next_page_number[0])
70 |                 else:
71 |                     next_page_number = 0
72 |                 self.search(page=next_page_number)  # recurse until there is no next page
73 |             else:
74 |                 print '=*= end search with page %d =*=' % (page / 10 + 1)
75 |         else:
76 |             print 'Error status code %d in getting search result with page %d' % (resp.status_code, (page / 10 + 1))
77 |             print resp.content
78 | 
79 |     def print_question_ids(self):
80 |         print self.question_ids
81 | 
82 |     def find_comments(self):
83 |         total = len(self.question_ids)
84 |         for i, question_id in enumerate(self.question_ids):
85 |             print '|*| start get content from question id %s - %d/%d |*|' % (question_id, i + 1, total)
86 |             url = self.comment_url.format(question_id=question_id)
87 |             self.comment(url)
88 |             print '_*_ end get content from question id %s - %d/%d _*_' % (question_id, i + 1, total)
89 | 
90 |     def comment(self, url, page=0):
91 |         print ' * start get comments with page %d *' % (page / 5 + 1)
92 |         time.sleep(SLEEP[random.randint(0, len(SLEEP) - 1)])
93 |         resp = self.session.get(url.replace('PAGE', str(page)), headers=self.my_headers, allow_redirects=False)
94 |         if resp.status_code != 200:
95 |             print 'Error status code %d in getting comment result with page %d' % (resp.status_code, (page / 5 + 1))
96 |             print resp.content
97 |         else:
98 |             response = etree.HTML(resp.content)
99 |             comment_nodes = response.xpath('//span[@class="con"]')
100 |             comments = []
101 |             for node in comment_nodes:
102 |                 print node.xpath('string(.)')
103 |                 comments.append(node.xpath('string(.)').strip())
104 |             print ' | get %d comments | ' % len(comments)
105 | 
106 |             # get the question title
107 |             ask_title = response.xpath('//title/text()')
108 |             if ask_title:
109 |                 ask_title = ask_title[0]
110 |             else:
111 |                 ask_title = ""
112 | 
113 |             if comments:
114 |                 comments = map(self.html_filter, comments)
115 |                 with codecs.open(self.filename, 'a', encoding='utf-8') as f:
116 |                     for data in comments:
117 |                         f.write(ask_title + '|' + data + '\n')
118 | 
119 |             next_page = response.xpath('//a[@class="pager-next"]/@href')
120 |             if next_page:
121 |                 next_page_number = re.findall('&pn=(\d+)#', next_page[0])
122 |                 if next_page_number:
123 |                     next_page_number = int(next_page_number[0])
124 |                 else:
125 |                     next_page_number = 0
126 |                 self.comment(url, next_page_number)  # recurse until there is no next page
127 |             else:
128 |                 print ' - end get comments with page %d -' % (page / 5 + 1)
129 | 
130 | 
131 | if __name__ == '__main__':
132 |     baidu_zhidao = BaiduZhidao('美年大健康')
133 |     baidu_zhidao.search()
134 |     baidu_zhidao.find_comments()
135 | 
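136 | 
137 | # The README lists "convert to a scrapy CrawlSpider" as an improvement that
138 | # was never done.  Below is a minimal sketch of what that skeleton might look
139 | # like.  It is an illustration only: the link-extraction rules and the XPaths
140 | # are assumptions carried over from the requests-based code above, not a
141 | # tested implementation.
142 | from scrapy.spiders import CrawlSpider, Rule
143 | from scrapy.linkextractors import LinkExtractor
144 | 
145 | 
146 | class BaiduZhidaoCrawlSpider(CrawlSpider):
147 |     name = 'baidu_zhidao_crawl_spider'
148 |     allowed_domains = ['zhidao.baidu.com']
149 |     # reuse the class-level search_url template, starting at page 0
150 |     start_urls = [BaiduZhidao.search_url.format(
151 |         keyword=urllib.quote('美年大健康'.decode('utf-8').encode('gbk'))).replace('PAGE', '0')]
152 |     rules = [
153 |         # follow the search-result pagination links
154 |         Rule(LinkExtractor(restrict_xpaths='//a[@class="pager-next"]'), follow=True),
155 |         # question detail pages carry the answers/comments
156 |         Rule(LinkExtractor(allow=r'/question/\d+\.html'), callback='parse_question'),
157 |     ]
158 | 
159 |     def parse_question(self, response):
160 |         # same XPath as BaiduZhidao.comment(): each answer body is a span.con
161 |         for node in response.xpath('//span[@class="con"]'):
162 |             yield {'comment': node.xpath('string(.)').extract_first().strip()}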
--------------------------------------------------------------------------------
/TMallCommentSpider/tmall_comment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import urllib
5 | import re
6 | import json
7 | import time
8 | import random
9 | import requests
10 | import codecs
11 | import datetime
12 | from lxml import etree
13 | 
14 | search_keyword = '美年健康'
15 | 
16 | 
17 | class TmallCommentSpider():
18 |     name = 'tmall_comment_spider'
19 |     keyword = urllib.quote(search_keyword.decode('utf-8').encode('gbk'))
20 |     search_url = 'https://list.tmall.com/search_product.htm?q={keyword}&type=p&vmarket=' \
21 |                  '&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'.format(keyword=keyword)
22 |     item_url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.50092370Go9Qa5' \
23 |                '&s={start_index}&q={keyword}&sort=s&style=g&from=.list.pc_1_searchbutton' \
24 |                '&type=pc#J_Filter'
25 |     comment_url = 'https://rate.tmall.com/list_detail_rate.htm'
26 | 
27 |     my_headers = {
28 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
29 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
30 |         'Accept-Language': 'en-US,en;q=0.5',
31 |         'Accept-Encoding': 'gzip, deflate, br',
32 |     }
33 | 
34 |     start_urls = [search_url]
35 | 
36 |     def __init__(self):
37 |         self.filename = 'tmall-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
38 |         self.session = requests.session()
39 |         print self.keyword
40 | 
41 |     def pre_get(self):
42 |         self.session.get(url='https://www.tmall.com/')
43 | 
44 |     def start_query(self):
45 |         self.pre_get()
46 |         my_headers = self.my_headers.copy()
47 |         my_headers['Host'] = 'list.tmall.com'
48 |         my_headers['Connection'] = 'keep-alive'
49 |         my_headers['Upgrade-Insecure-Requests'] = '1'
50 |         my_headers['Referer'] = 'https://www.tmall.com/'
51 |         for url in self.start_urls:
52 |             resp = self.session.get(url=url, headers=my_headers, allow_redirects=False)
53 |             # resp = self.session.get(url=url, headers=my_headers)
54 |             print resp.url
55 |             print resp.content
56 |             self.parse(resp)
57 | 
58 |     def parse(self, response):
59 |         response = etree.HTML(response.content)
60 |         total_page_selector = response.xpath('//input[@name="totalPage"]')
61 |         if total_page_selector:
62 |             total_page = int(total_page_selector[0].get('value'))
63 |             for page in range(total_page):
64 |                 start_index = page * 60
65 |                 page_url = self.item_url.format(start_index=start_index, keyword=self.keyword)
66 |                 resp = self.session.get(url=page_url, headers=self.my_headers)
67 |                 self.parse_search_result(resp)
68 | 
69 |     def parse_search_result(self, response):
70 |         response = etree.HTML(response.content)
71 |         item_urls = response.xpath('//p[@class="productStatus"]//a/@href')
72 |         comment_nums = response.xpath('//p[@class="productStatus"]//a/text()')
73 |         for i in range(len(item_urls)):
74 |             if int(comment_nums[i]):
75 |                 resp = self.session.get(url='https:' + item_urls[i], headers=self.my_headers)
76 |                 self.parse_item(resp)
77 | 
78 |     def parse_item(self, response):
79 |         resp_text = response.text.replace('\n', '')
80 |         pattern = 'TShop\.Setup\((.*?)\);'
81 |         result = re.findall(pattern, resp_text)
82 | 
83 |         data = {
84 |             'itemId': '',  # required
85 |             'spuId': '',  # required
86 |             'sellerId': '',  # required
87 |             'order': '3',
88 |             'currentPage': '1',  # page number
89 |             'append': '0',
90 |             'content': '1',
91 |             'tagId': '',
92 |             'posi': '',
93 |             'picture': '',
94 |             'ua': '',
95 |             'needFold': '0',
96 |             '_ksTS': '',  # millisecond timestamp + '_' + 4-digit random number
97 |             'callback': '',  # 'jsonp_' + (4-digit random number + 1)
98 |         }
99 | 
100 |         if result:
101 |             json_data = json.loads(result[0])
102 |             item_do = json_data['itemDO']
103 | 
104 |             data['itemId'] = item_do['itemId']
105 |             data['spuId'] = item_do['spuId']
106 |             data['sellerId'] = item_do['userId']
107 |             random_int = random.randint(1000, 9998)
108 |             data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
109 |             data['callback'] = 'jsonp_%d' % (random_int + 1)
110 | 
111 |             my_headers = self.my_headers.copy()
112 |             my_headers['Host'] = 'rate.tmall.com'
113 |             my_headers['Referer'] = response.url
114 | 
115 |             resp = requests.get(self.comment_url, params=data, headers=my_headers)
116 |             json_str = resp.content[len(data['callback']) + 1:-1]  # strip the jsonp wrapper: callback( ... )
117 |             json_data = json.loads(json_str)
118 |             max_pages = json_data['rateDetail']['paginator']['lastPage']
119 |             for i in range(max_pages):
120 |                 random_int = random.randint(1000, 9998)
121 |                 data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
122 |                 data['callback'] = 'jsonp_%d' % (random_int + 1)
123 |                 data['currentPage'] = str(i + 1)
124 | 
125 |                 resp = self.session.get(url=self.comment_url, params=data, headers=my_headers)
126 |                 self.parse_comment(resp)
127 | 
128 |     def parse_comment(self, response):
129 |         json_data = json.loads(response.text[len('jsonp_9999') + 1:-1])  # the callback name is always 10 characters long
130 |         rate_list = json_data['rateDetail']['rateList']
131 |         for rate in rate_list:
132 |             user_nickname = rate['displayUserNick']
133 |             user_id = rate['id']
134 |             rate_content = rate['rateContent']
135 |             rate_date = rate['rateDate']
136 | 
137 |             with codecs.open(self.filename, 'a') as f:
138 |                 f.write('|'.join((str(user_id), user_nickname, rate_content, rate_date)) + '\n')
139 | 
140 | if __name__ == '__main__':
141 |     tmall = TmallCommentSpider()
142 |     tmall.start_query()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/zhenai_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | import json
5 | import traceback
6 | import mymongo
7 | from utils import *
8 | from ZhenAi.items import ZhenaiItem
9 | from scrapy.utils.response import get_base_url
10 | 
11 | 
12 | class ZhenaiSpider(scrapy.Spider):
13 |     name = 'zhenai_spider'
14 |     # generate all base urls
15 |     base_url = 'http://search.zhenai.com/v2/search/getPinterestData.do?sex={}&agebegin={}&ageend={}&workcityprovince={}' \
16 |                '&workcitycity={}&education={}' \
17 |                '&occupation={}&info=&marriage={}&h1={}&h2={}&salaryBegin={}&salaryEnd={}' \
18 |                '&h={}&c={}&workcityprovince1={}&workcitycity1={}&constellation={}&animals={}&stock={}&belief={}' \
19 |                '&lvBegin={}&lvEnd={}&condition=66&orderby=hpf&hotIndex=&online={}&currentpage={}&topSearch=false'
20 |     sex = [0, 1]
21 |     agebegin = range(18, 100)
22 |     agebegin.append(-1)
23 |     agebegin.reverse()
24 |     ageend = range(18, 100)
25 |     ageend.append(-1)
26 |     ageend.reverse()
27 |     workcityprovince = [-1]  # TODO
28 |     workcitycity = [-1]  # TODO
29 |     education = range(2, 8)
30 |     education.append(-1)
31 |     education.reverse()
32 |     occupation = range(100, 2900, 100)
33 |     occupation.append(-1)
34 |     occupation.reverse()
35 |     marriage = [-1, 1, 3, 4]  # unmarried / divorced / widowed
36 |     h1 = range(129, 212)
37 |     h1.append(-1)
38 |     h1.reverse()
39 |     h2 = range(129, 212)
40 |     h2.append(-1)
41 |     h2.reverse()
42 |     salaryBegin = range(103, 109)  # monthly income
43 |     salaryBegin.append(-1)
44 |     salaryBegin.reverse()
45 |     salaryEnd = range(103, 109)
46 |     salaryEnd.append(-1)
47 |     salaryEnd.reverse()
48 |     h = range(1, 6)  # house
49 |     h.append(-1)
50 |     h.reverse()
51 |     c = range(1, 6)  # children
52 |     c.append(-1)
53 |     c.reverse()
54 |     workcityprovince1 = [-1]  # TODO
55 |     workcitycity1 = [-1]  # TODO
56 |     constellation = range(1, 13)  # zodiac sign
57 |     constellation.append(-1)
58 |     constellation.reverse()
59 |     animals = range(1, 13)  # Chinese zodiac animal
60 |     animals.append(-1)
61 |     animals.reverse()
62 |     stock = range(1, 58)  # ethnicity
63 |     stock.append(-1)
64 |     stock.reverse()
65 |     belief = range(1, 14)  # not sure
66 |     belief.append(-1)
67 |     belief.reverse()
68 |     lvBegin = range(1, 8)
69 |     lvBegin.append(-1)
70 |     lvBegin.reverse()
71 |     lvEnd = range(1, 8)
72 |     lvEnd.append(-1)
73 |     lvEnd.reverse()
74 |     online = [-1, 1]
75 |     currentpage = range(1, 101)
76 |     start_urls = url_generator(base_url, sex, agebegin, ageend, workcityprovince, workcitycity,
77 |                                education, occupation, marriage, h1, h2, salaryBegin, salaryEnd, h, c,
78 |                                workcityprovince1, workcitycity1, constellation, animals, stock, belief, lvBegin, lvEnd,
79 |                                online, currentpage)
80 | 
81 |     def parse(self, response):
82 |         data = {}
83 |         try:
84 |             data = json.loads(response.text)
85 |         except:
86 |             traceback.print_exc()
87 |         if data and 'data' in data:
88 |             doc_list = data['data']
89 |             if len(doc_list) > 0:
90 |                 mongo = mymongo.MyMongo()
91 |                 mongo.insert_doc('ZhenAi', 'SimpleData', doc_list)
92 |                 for doc in doc_list:
93 |                     member_id = doc['memberId']
94 |                     url = 'http://album.zhenai.com/u/{}?flag=s'.format(member_id)
95 |                     yield scrapy.Request(url=url, callback=self.parse_detail)
96 | 
97 |     def parse_detail(self, response):
98 |         items = ZhenaiItem()
99 |         html_selector = scrapy.Selector(response)
100 |         url = get_base_url(response)
101 |         items['pic_url'] = html_selector.xpath(
102 |             '//div[@id="AblumsThumbsListID"]/ul/li/p/img[1]/@data-big-img').extract()
103 | 
104 |         honesty_charm = html_selector.xpath('//p[@class="brief-info fs14 lh32 c9f"]/span/span/text()').extract()
105 |         honesty = '--'
106 |         charm = '--'
107 |         if len(honesty_charm) == 2:
108 |             honesty = honesty_charm[0]
109 |             charm = honesty_charm[1]
110 |         zhima_info = html_selector.xpath(
111 |             '//p[@class="brief-name lh32 blue"]//a[@class="flag-credit credit-js"]/text()').extract_first()
112 |         if zhima_info:
113 |             zhima_info = zhima_info.replace(u'\u5206', '')
114 |             if not zhima_info.isdigit():
115 |                 zhima_info = '--'
116 |         else:
117 |             zhima_info = '--'
118 | 
119 |         brief_table_td = html_selector.xpath(
120 |             '//table[@class="brief-table"]//td').extract()  # ['x: y']
121 |         brief_dict = {}
122 |         for td in brief_table_td:
123 |             key, value = get_brief_td_to_key_value(td)
124 |             if key is not None and value is not None:
125 |                 brief_dict[key] = value
126 | 
127 |         nick_name = html_selector.xpath('//a[@class="name fs24"]/text()').extract_first()
128 | 
129 |         id_str = html_selector.xpath('//p[@class="brief-info fs14 lh32 c9f"]/text()').extract_first()
130 |         id = re.findall('ID.*?(\d+)', id_str)[0]
131 | 
132 |         person_os = html_selector.xpath(
133 |             '//div[@class="mod-tab-info"]//div[@class="info-item slider info-inner"]'
134 |             '//p[@class="fs14 lh20 c5e slider-area-js"]/text()').extract()
135 | 
136 |         data_table_td = html_selector.xpath('//div[@class="info-floor floor-data posr clearfix"]//table//td').extract()
137 |         data_dict = {}
138 |         for td in data_table_td:
139 |             key, value = get_info_td_to_key_value(td)
140 |             if key is not None and value is not None:
141 |                 data_dict[key] = value
142 | 
143 |         life_table_td = html_selector.xpath('//div[@class="info-floor floor-life posr clearfix"]//table//td').extract()
html_selector.xpath('//div[@class="info-floor floor-life posr clearfix"]//table//td').extract() 144 | life_dict = {} 145 | for td in life_table_td: 146 | key, value = get_info_td_to_key_value(td) 147 | if key is not None and value is not None: 148 | life_dict[key] = value 149 | 150 | hobby_table_td = html_selector.xpath( 151 | '//div[@class="info-floor floor-hobby posr clearfix"]//table//td').extract() 152 | hobby_dict = {} 153 | for td in hobby_table_td: 154 | key, value = get_info_td_to_key_value(td) 155 | if key is not None and value is not None: 156 | hobby_dict[key] = value 157 | 158 | term_table_td = html_selector.xpath('//div[@class="info-floor floor-term posr clearfix"]//table//td').extract() 159 | term_dict = {} 160 | for td in term_table_td: 161 | key, value = get_info_td_to_key_value(td) 162 | if key is not None and value is not None: 163 | term_dict[key] = value 164 | 165 | all_data = { 166 | 'nick_name': nick_name, 167 | 'url': url, 168 | 'member_id': id, 169 | 'person_os': person_os, 170 | 'honesty': honesty, 171 | 'zhima': zhima_info, 172 | 'charm': charm, 173 | 'brief_data': brief_dict, 174 | 'data': data_dict, 175 | 'life': life_dict, 176 | 'hobby': hobby_dict, 177 | 'term': term_dict, 178 | 'pic_url': items['pic_url'] 179 | } 180 | mongo = mymongo.MyMongo() 181 | mongo.insert_doc('ZhenAi', 'CompleteData', all_data) 182 | 183 | return items 184 | -------------------------------------------------------------------------------- /JDCommentSpider/jdcomment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import urllib 5 | import requests 6 | import re 7 | import math 8 | import time 9 | import random 10 | import json 11 | import codecs 12 | import datetime 13 | from collections import OrderedDict 14 | 15 | KEYWORD = '美年健康' 16 | 17 | # code starts here 18 | URL_ENCODE_KEYWORD = urllib.quote(KEYWORD) 19 | # search page 20 | SEARCH_REFERER = 'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={keyword}&page={page_keyword}&s={start_item}&click=0' 21 | # search result endpoint 22 | SEARCH_URL = 'https://search.jd.com/s_new.php' 23 | # Host for the search endpoint 24 | SEARCH_HOST = 'search.jd.com' 25 | # product (comment) page 26 | COMMENT_REFERER = 'https://item.jd.com/ID.html' 27 | # comment result endpoint 28 | COMMENT_URL = 'https://sclub.jd.com/comment/productPageComments.action' 29 | # Host for the comment endpoint 30 | COMMENT_HOST = 'sclub.jd.com' 31 | 32 | CSV_SEQ = '|' # csv field separator 33 | 34 | # placeholders used with str.replace 35 | PAGE_KEYWORD = 'PAGE' 36 | START_ITEM_KEYWORD = 'SS' 37 | COMMENT_REFERER_KEYWORD = 'ID' 38 | 39 | # initialize the search page referer 40 | SEARCH_REFERER = SEARCH_REFERER.format(keyword=URL_ENCODE_KEYWORD, page_keyword=PAGE_KEYWORD, 41 | start_item=START_ITEM_KEYWORD) 42 | 43 | DEFAULT_REQUEST_HEADERS = { 44 | # 'Host': 'search.jd.com', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 46 | 'Accept': '*/*', 47 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 48 | 'Accept-Encoding': 'gzip, deflate, br', 49 | 'X-Requested-With': 'XMLHttpRequest', 50 | # 'Referer': BASE_REFERER, 51 | 'Connection': 'keep-alive', 52 | 'Pragma': 'no-cache', 53 | 'Cache-Control': 'no-cache', 54 | } 55 | 56 | COMMENT_REQUEST_HEADERS = { 57 | # 'Host': 'sclub.jd.com', 58 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 59 | 'Accept': '*/*', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 61 | 'Accept-Encoding': 'gzip, deflate, br', 62 | # 'Referer': 
'https://item.jd.com/12571462129.html', 63 | 'Connection': 'keep-alive', 64 | } 65 | 66 | ITEM_REQUEST_HEADERS = { 67 | 'Host': 'item.jd.com', 68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 69 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 70 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 71 | 'Accept-Encoding': 'gzip, deflate, br', 72 | 'Connection': 'keep-alive', 73 | 'Upgrade-Insecure-Requests': '1', 74 | 'If-Modified-Since': 'Wed, 08 Nov 2017 05:49:40 GMT', 75 | } 76 | 77 | DATA = { 78 | 'keyword': KEYWORD, 79 | 'enc': 'utf-8', 80 | 'qrst': '1', 81 | 'rt': '1', 82 | 'stop': '1', 83 | 'vt': '2', 84 | 'wq': KEYWORD, 85 | 'page': '1', 86 | 's': '1', 87 | 'psort': '4', # sort by number of comments 88 | 'scrolling': 'y', 89 | 'tpl': '1_M', 90 | 'log_id': '1510033965.96458', 91 | 'show_items': '', 92 | } 93 | 94 | COMMENT_DATA = { 95 | # 'callback': 'fetchJSON_comment98vv152', # the three trailing digits are random 96 | # 'productId': '12571462129', # product id 97 | 'score': '0', 98 | 'sortType': '5', 99 | # 'page': '0', # comment page number (0-based) 100 | 'pageSize': '10', 101 | 'isShadowSku': '0', 102 | 'fold': '1', 103 | } 104 | 105 | COMMENT_EXTRA_KEYWORDS = [ 106 | ('id', u'用户ID'), 107 | ('nickname', u'用户名'), 108 | ('content', u'评论内容'), 109 | ('creationTime', u'评论日期'), 110 | ('score', u'评分'), 111 | ('referenceName', u'商品名称'), 112 | ('productColor', u'产品类型'), 113 | ] 114 | 115 | COMMENT_EXTRA_KEYWORDS_DICT = OrderedDict() 116 | for key, value in COMMENT_EXTRA_KEYWORDS: 117 | COMMENT_EXTRA_KEYWORDS_DICT[key] = value 118 | 119 | 120 | def parse_html_to_get_ids(html_content): 121 | ids = [] 122 | pattern = '
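# -- A minimal, hypothetical sketch (NOT the original code, whose remainder is
# not preserved above): the 'data-sku' regex and both helper names below are
# assumptions, built only from the constants defined earlier in this file and
# from the README that follows.
def parse_html_to_get_ids_sketch(html_content):
    # assumption: JD search result items carry a data-sku="<id>" attribute
    return re.findall(r'data-sku="(\d+)"', html_content)


def fetch_comment_page(product_id, page):
    # hypothetical helper: fetch one page of comments for a product (product_id
    # as a string). Per the README below, the endpoint checks Host and Referer,
    # the trailing callback digits are arbitrary, and pages count from 0.
    headers = COMMENT_REQUEST_HEADERS.copy()
    headers['Host'] = COMMENT_HOST
    headers['Referer'] = COMMENT_REFERER.replace(COMMENT_REFERER_KEYWORD, product_id)
    data = COMMENT_DATA.copy()
    callback = 'fetchJSON_comment98vv%d' % random.randint(100, 999)
    data.update({'callback': callback, 'productId': product_id, 'page': str(page)})
    resp = requests.get(COMMENT_URL, params=data, headers=headers)
    # strip the jsonp wrapper "callback({...});" before parsing
    return json.loads(resp.content[len(callback) + 1:-2])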
-------------------------------------------------------------------------------- /JDCommentSpider/README.md: -------------------------------------------------------------------------------- 95 | ### Comment search API 96 | 1. The `COMMENT_URL` endpoint validates the `Host` and `Referer` request headers, so both must be set exactly 97 | 2. `page` starts from 0, with no odd/even page split 98 | 99 | #### url 100 | ```python 101 | # product (comment) page 102 | COMMENT_REFERER = 'https://item.jd.com/ID.html' 103 | # comment result endpoint 104 | COMMENT_URL = 'https://sclub.jd.com/comment/productPageComments.action' 105 | ``` 106 | #### headers 107 | ```python 108 | COMMENT_REQUEST_HEADERS = { 109 | 'Host': 'sclub.jd.com', 110 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 111 | 'Accept': '*/*', 112 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 113 | 'Accept-Encoding': 'gzip, deflate, br', 114 | 'Referer': 'https://item.jd.com/12571462129.html', # the corresponding product page 115 | 'Connection': 'keep-alive', 116 | } 117 | ``` 118 | #### params 119 | ```python 120 | COMMENT_DATA = { 121 | 'callback': 'fetchJSON_comment98vv152', # the three trailing digits are random 122 | 'productId': '12571462129', # product id 123 | 'score': '0', 124 | 'sortType': '5', 125 | 'page': '0', # comment page number (0-based) 126 | 'pageSize': '10', 127 | 'isShadowSku': '0', 128 | 'fold': '1', 129 | } 130 | ``` 131 | #### response 132 | ```javascript 133 | fetchJSON_comment98vv122({ 134 | "productAttr": null, 135 | "productCommentSummary": { 136 | "goodRateShow": 97, 137 | "poorRateShow": 1, 138 | "poorCountStr": "10+", 139 | "averageScore": 5, 140 | "generalCountStr": "20+", 141 | "oneYear": 0, 142 | "showCount": 80, 143 | "showCountStr": "80+", 144 | "goodCount": 1000, 145 | "generalRate": 0.021, 146 | "generalCount": 20, 147 | "skuId": 2667959, 148 | "goodCountStr": "1000+", 149 | "poorRate": 0.011, 150 | "afterCount": 3, 151 | "goodRateStyle": 145, 152 | "poorCount": 10, 153 | "skuIds": null, 154 | "poorRateStyle": 2, 155 | "generalRateStyle": 3, 156 | "commentCountStr": "1000+", 157 | "commentCount": 1000, 158 | "productId": 2667959, 159 | "afterCountStr": "3", 160 | "defaultGoodCount": 300, 161 | "goodRate": 0.968, 162 | "generalRateShow": 2, 163 | "defaultGoodCountStr": "300+" 164 | }, 165 | "hotCommentTagStatistics": [{ 166 | "id": "1467260", 167 | "name": "送货快", 168 | "status": 0, 169 | "rid": "16727", 170 | "productId": 2667959, 171 | "count": 1, 172 | "modified": "2017-01-09 11:44:35", 173 | "type": 0, 174 | "canBeFiltered": false 175 | }], 176 | "jwotestProduct": "99", 177 | "maxPage": 66, 178 | "score": 0, 179 | "soType": 5, 180 | "imageListCount": 116, 181 | "vTagStatistics": null, 182 | "comments": [{ 183 | "id": 10827600829, 184 | "guid": "095f5c63-6c5d-4e7b-af58-5fe05c2003c7", 185 | "content": "检查完了,本人是在广州使用,服务态度没得说,非常好,每个科室医生检查详细,还有送早餐,以后每年检查都会去爱康国宾。", 186 | "creationTime": "2017-10-01 17:54:31", 187 | "isTop": false, 188 | "referenceId": "2667959", 189 | "referenceImage": "jfs/t7381/348/4133538107/380092/22e1dcab/59ffb879N4d2e5230.jpg", 190 | "referenceName": "爱康国宾(ikang)体检卡 深爱老公老婆体检套餐 全国门店通用", 191 | "referenceTime": "2017-09-07 18:29:05", 192 | "referenceType": "Product", 193 | "referenceTypeId": 0, 194 | "firstCategory": 9192, 195 | "secondCategory": 14203, 196 | "thirdCategory": 14204, 197 | "replyCount": 0, 198 | "score": 5, 199 | "status": 1, 200 | "title": "", 201 | "usefulVoteCount": 1, 202 | "uselessVoteCount": 0, 203 | "userImage": "misc.360buyimg.com/user/myjd-2015/css/i/peisong.jpg", 204 | "userImageUrl": "misc.360buyimg.com/user/myjd-2015/css/i/peisong.jpg", 205 | "userLevelId": "105", 206 | "userProvince": "", 207 | "viewCount": 0, 208 | "orderId": 0, 209 | "isReplyGrade": false, 210 | "nickname": "帅***飞", 211 | "userClient": 2, 212 | "images": [{ 213 | "id": 415870745, 214 | "associateId": 
263301426, 215 | "productId": 0, 216 | "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t10873/14/490911554/1437197/5ba03569/59d11223Na36b22bd.jpg", 217 | "available": 1, 218 | "pin": "", 219 | "dealt": 0, 220 | "imgTitle": "", 221 | "isMain": 0, 222 | "jShow": 0 223 | }], 224 | "showOrderComment": { 225 | "id": 263301426, 226 | "guid": "9a24cb67-b7e0-40a6-ac1c-9960ff5d2707", 227 | "content": "检查完了,本人是在广州使用,服务态度没得说,非常好,每个科室医生检查详细,还有送早餐,以后每年检查都会去爱康国宾。", 228 | "creationTime": "2017-10-02 00:04:51", 229 | "isTop": false, 230 | "referenceId": "2667959", 231 | "referenceType": "Order", 232 | "referenceTypeId": 0, 233 | "firstCategory": 0, 234 | "secondCategory": 0, 235 | "thirdCategory": 0, 236 | "replyCount": 0, 237 | "score": 0, 238 | "status": 1, 239 | "usefulVoteCount": 0, 240 | "uselessVoteCount": 0, 241 | "userProvince": "", 242 | "viewCount": 0, 243 | "orderId": 0, 244 | "isReplyGrade": false, 245 | "userClient": 2, 246 | "isDeal": 1, 247 | "integral": -20, 248 | "userImgFlag": 0, 249 | "anonymousFlag": 1, 250 | "recommend": false, 251 | "userLevelColor": "#666666", 252 | "userClientShow": "来自京东iPhone客户端", 253 | "isMobile": true 254 | }, 255 | "mergeOrderStatus": 2, 256 | "discussionId": 263301426, 257 | "productColor": "深爱老公老婆", 258 | "productSize": "", 259 | "imageCount": 3, 260 | "integral": -20, 261 | "userImgFlag": 0, 262 | "anonymousFlag": 1, 263 | "userLevelName": "PLUS会员", 264 | "plusAvailable": 201, 265 | "userExpValue": 31721, 266 | "productSales": [], 267 | "recommend": true, 268 | "userLevelColor": "#e1a10a", 269 | "userClientShow": "来自京东iPhone客户端", 270 | "isMobile": true, 271 | "days": 24, 272 | "afterDays": 0 273 | }, 274 | ] 275 | }); 276 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /ZhihuSpider/zhihu_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import requests 5 | import json 6 | import re 7 | import base64 8 | import hmac 9 | import hashlib 10 | import time 11 | import datetime 12 | import codecs 13 | from PIL import Image 14 | 15 | import sys 16 | 17 | reload(sys) 18 | sys.setdefaultencoding("utf-8") 19 | 20 | 21 | class Zhihu(object): 22 | sigup_url = 'https://www.zhihu.com/signup' 23 | home_url = 'https://www.zhihu.com' 24 | sigin_url = 'https://www.zhihu.com/api/v3/oauth/sign_in' 25 | search_url = 'https://www.zhihu.com/search' 26 | search_api_url = "https://www.zhihu.com/api/v4/search_v3" 27 | question_page_url = 'https://www.zhihu.com/question/' 28 | comment_api_url = 'https://www.zhihu.com/api/v4/questions/{question_id}/answers' 29 | 30 | client_id = 'c3cef7c66a1843f8b3a9e6a1e3160e20' 31 | authorization = 'oauth ' + client_id 32 | 33 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \ 34 | 'Chrome/64.0.3282.186 Safari/537.36' 35 | 36 | simple_headers = { 37 | 'User-Agent': user_agent, 38 | } 39 | 40 | headers_sigup = { 41 | 'User-Agent': user_agent, 42 | 'Connection': 'keep-alive', 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 44 | 'Accept-Encoding': 'gzip, deflate, br', 45 | 'Accept-Language': 'zh-CN,zh;q=0.9', 46 | 'Host': 'www.zhihu.com' 47 | } 48 | 49 | headers_sigin = { 50 | 'authorization': authorization, 51 | 'Referer': sigup_url, 52 | 'Origin': home_url, 53 | 'User-Agent': user_agent, 54 | } 55 | 56 | headers_captcha = { 57 | 'authorization': authorization, 58 | 'Referer': sigup_url, 59 | 'User-Agent': user_agent, 60 | } 61 | 62 | login_payload = { 63 | 'client_id': client_id, 64 | 'grant_type': 'password', 65 | 'source': 'com.zhihu.web', 66 | 'lang': 'en', 67 | 'ref_source': 'other', 68 | 'utm_source': None, 69 | } 70 | 71 | search_api_headers = { 72 | 'accept': 'application/json, text/plain, */*', 73 | 'Accept-Encoding': 'gzip, deflate, br', 74 | 'Accept-Language': 'zh-CN,zh;q=0.9', 75 | # 'authorization': 'Bearer 2|1:0|10:1520824661|4:z_c0|92:Mi4xOW43QUF3QUFBQUFBVU1JQkxabUREQ1lBQUFCZ0FsVk5WRC1UV3dEY1hZZzl0QjRxVDFHSmpKbFFCY2NpT0lqVlNR|ae27d5db5fb5be6be4a9e8dcfb871161169b9cd00eb9265341366fbadabffaca', 76 | # 'Cookie': '_zap=a5e45e29-dbb0-4bcc-b318-23dfca5fd933; q_c1=bf06d2672d984917b2f06efa033cc30f|1505131157000|1502242612000; d_c0="AFDCAS2ZgwyPTmHKenJ488LtpT5Eu0sVI_o=|1507769845"; __utma=51854390.746326380.1507769846.1512042244.1512536288.6; __utmz=51854390.1512536288.6.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/question/41295948; __utmv=51854390.000--|2=registration_date=20161129=1^3=entry_date=20170809=1; q_c1=bf06d2672d984917b2f06efa033cc30f|1520305668000|1502242612000; aliyungf_tc=AQAAAIhMQwyLaQkA7G4Ot+e2JkBHD3BH; _xsrf=23549cd1-24ba-47c3-be8b-cd31a966c66b; capsion_ticket="2|1:0|10:1520824649|14:capsion_ticket|44:NjE3ZGEyYjkzNTIwNDJiNGFjZDdiMjIyMWFhMGYxMjA=|40c554d334082285d04501cd6739f752f3e07145745d495bfcd32c00ebf5d5f1"; z_c0="2|1:0|10:1520824661|4:z_c0|92:Mi4xOW43QUF3QUFBQUFBVU1JQkxabUREQ1lBQUFCZ0FsVk5WRC1UV3dEY1hZZzl0QjRxVDFHSmpKbFFCY2NpT0lqVlNR|ae27d5db5fb5be6be4a9e8dcfb871161169b9cd00eb9265341366fbadabffaca"', 77 | 'Host': 'www.zhihu.com', 78 | # 'Referer': 'https://www.zhihu.com/search?type=content&q=python', 79 | 
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', 80 | 'X-API-Version': '3.0.91', 81 | 'X-App-Za': 'OS=Web', 82 | # 'X-UDID': 'AFDCAS2ZgwyPTmHKenJ488LtpT5Eu0sVI_o=', 83 | } 84 | 85 | search_api_payload = { 86 | 't': 'general', 87 | 'correction': '1', 88 | 'limit': '10', 89 | 'q': 'python', 90 | 'search_hash_id': '0ca5c03842318b3fdb51cfc4c11340e9', 91 | 'offset': '0' 92 | } 93 | 94 | comment_api_headers = { 95 | 'accept': 'application/json, text/plain, */*', 96 | 'Accept-Language': 'zh-CN,zh;q=0.9', 97 | 'Host': 'www.zhihu.com', 98 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', 99 | } 100 | 101 | comment_api_payload = { 102 | 'sort_by': 'default', 103 | 'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics', 104 | 'limit': '5', 105 | 'offset': '5', 106 | } 107 | 108 | def __init__(self, username, password): 109 | self.username, self.password = username, password 110 | self.session = requests.session() 111 | 112 | def get_token(self): 113 | resp = self.session.get(self.sigup_url, headers=self.headers_sigup, allow_redirects=False) 114 | return resp.cookies['_xsrf'] 115 | 116 | def get_captcha(self): 117 | captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha' 118 | query_string_parameters = {'lang': 'en'} 119 | resp = self.session.get(captcha_url, params=query_string_parameters, headers=self.headers_captcha) 120 | 121 | if json.loads(resp.text)['show_captcha']: 122 | resp = self.session.put(captcha_url, data=query_string_parameters, headers=self.headers_captcha) 123 | print resp.content 124 | img_data = base64.b64decode(resp.json()['img_base64']) 125 | with open('captcha_zhihu.png', 'wb') as f: 126 | f.write(img_data) 127 | 128 | # open the captcha image 129 | image = Image.open('captcha_zhihu.png') 130 | image.show() 131 | 132 | captcha = raw_input(u'Please enter the captcha: ') 133 | return captcha 134 | else: 135 | return None 136 | 137 | def get_signature(self, timestamp): 138 | h = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=hashlib.sha1) 139 | grant_type = self.login_payload['grant_type'] 140 | source = self.login_payload['source'] 141 | h.update(grant_type + self.client_id + source + timestamp) 142 | return h.hexdigest() 143 | 144 | def check_login(self): 145 | resp = self.session.get(self.sigup_url, allow_redirects=True, headers=self.simple_headers) 146 | if resp.url == self.home_url: 147 | return True 148 | 149 | def login(self): 150 | xsrf_token = self.get_token() 151 | timestamp = str(int(time.time() * 1000)) 152 | signature = self.get_signature(timestamp) 153 | captcha = self.get_captcha() 154 | self.login_payload.update({ 155 | 'username': self.username, 156 | 'password': self.password, 157 | 'timestamp': timestamp, 158 | 'signature': signature, 159 | 'captcha': captcha, 160 | }) 161 | self.headers_sigin.update({'X-Xsrftoken': xsrf_token}) 162 | resp = self.session.post(self.sigin_url, data=self.login_payload, headers=self.headers_sigin, 163 | allow_redirects=False) 164 | check = 
self.check_login() 165 | if 'error' in resp.text: 166 | print resp.text 167 | elif check: 168 | print u'Login succeeded!' 169 | 170 | @staticmethod 171 | def html_tags_eraser(htmls): 172 | htmls = unicode(htmls) 173 | pattern = re.compile(r'<[^>]+>', re.S) 174 | return pattern.sub('', htmls).replace('|', ' ').replace('\r', '').replace('\n', '') 175 | 176 | def search_questions(self, keyword): 177 | # obtain the search_hash_id 178 | search_payload = { 179 | 'type': 'content', 180 | 'q': keyword 181 | } 182 | resp = self.session.get(url=self.search_url, params=search_payload, headers=self.simple_headers) 183 | referer = resp.url 184 | 185 | hash_id = None 186 | hash_id_pattern = "search_hash_id=([\d\w]+)" 187 | result = re.search(hash_id_pattern, resp.content) 188 | if result: 189 | hash_id = result.group(1) 190 | 191 | # get x-uuid and authorization from the cookies 192 | self.session.get(url=self.search_url, params=search_payload, headers=self.simple_headers) 193 | print self.session.cookies 194 | x_uuid = self.session.cookies['d_c0'].split('|')[0].replace('"', '') 195 | authorization = 'Bearer ' + self.session.cookies['z_c0'].replace('"', '') 196 | 197 | self.search_api_headers.update({'authorization': authorization, 'Referer': referer, 'X-UDID': x_uuid}) 198 | print self.search_api_headers 199 | 200 | questions = [] 201 | offset = 0 202 | limit = 10 203 | while True: 204 | print 'Getting search result {} - {}'.format(offset, offset + limit) 205 | self.search_api_payload.update({'q': keyword, 'search_hash_id': hash_id, 206 | 'offset': str(offset), 'limit': str(limit)}) 207 | resp = self.session.get(self.search_api_url, params=self.search_api_payload, 208 | headers=self.search_api_headers) 209 | json_data = resp.json() 210 | 211 | is_end = json_data['paging']['is_end'] 212 | items = json_data['data'] 213 | # keep only question-type results 214 | questions.extend([item['object']['question']['url'].split('/')[-1] for item in items 215 | if 'object' in item and 'question' in item['object']]) 216 | 217 | if is_end: 218 | break 219 | else: 220 | offset += limit 221 | 222 | return questions 223 | 224 | def get_comments(self, question_ids): 225 | # build the output file 226 | filename = 'zhihu-%s-comments.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M')) 227 | # write the csv header 228 | with codecs.open(filename, 'w', 'utf_8_sig') as f: 229 | f.write(u'评论id|评论url|评论内容|评论创建时间|评论更新时间|作者|作者url|作者id|问题id|问题标题' 230 | u'|问题url|问题创建时间|问题更新时间\r\n') 231 | 232 | limit = 5 233 | 234 | for question_id in question_ids: 235 | print 'Processing Question {}'.format(question_id) 236 | question_page_url = self.question_page_url + question_id 237 | comment_api_url = self.comment_api_url.format(question_id=question_id) 238 | print comment_api_url 239 | 240 | # get x-uuid and authorization from the cookies 241 | self.session.get(url=question_page_url, headers=self.simple_headers) 242 | print self.session.cookies 243 | x_uuid = self.session.cookies['d_c0'].split('|')[0].replace('"', '') 244 | authorization = 'Bearer ' + self.session.cookies['z_c0'].replace('"', '') 245 | # assemble the request headers 246 | self.comment_api_headers.update({'authorization': authorization, 'Referer': question_page_url, 247 | 'X-UDID': x_uuid}) 248 | print self.comment_api_headers 249 | 250 | offset = 0 251 | while True: 252 | print 'Getting comment result {} - {}'.format(offset, offset + limit) 253 | self.comment_api_payload.update({'limit': str(limit), 'offset': str(offset)}) 254 | resp = self.session.get(comment_api_url, params=self.comment_api_payload, 255 | headers=self.comment_api_headers) 256 | print resp.text 257 | json_data = resp.json() 258 
| 259 | is_end = json_data['paging']['is_end'] 260 | items = json_data['data'] 261 | 262 | for item in items: 263 | created_time = item['created_time'] 264 | updated_time = item['updated_time'] 265 | id_ = item['id'] 266 | url = item['url'] 267 | content = item['content'] 268 | 269 | author_name = item['author']['name'] 270 | author_url = item['author']['url'] 271 | author_id = item['author']['id'] 272 | 273 | question_title = item['question']['title'] 274 | question_url = item['question']['url'] 275 | question_created = item['question']['created'] 276 | question_updated = item['question']['updated_time'] 277 | 278 | with codecs.open(filename, 'a', 'utf_8_sig') as f: 279 | content = '|'.join( 280 | map(self.html_tags_eraser, (id_, url, content, created_time, updated_time, author_name, 281 | author_url, author_id, question_id, question_title, 282 | question_url, 283 | question_created, question_updated))) + '\r\n' 284 | print content 285 | f.write(content) 286 | 287 | if is_end: 288 | break 289 | offset += limit 290 | 291 | 292 | if __name__ == '__main__': 293 | zhihu = Zhihu('username', 'password') 294 | zhihu.login() 295 | question_ids = zhihu.search_questions('python') 296 | print len(question_ids) 297 | print question_ids 298 | zhihu.get_comments(question_ids) 299 | -------------------------------------------------------------------------------- /WeiboCommentSpider/weibocomment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import urllib 5 | import base64 6 | import rsa 7 | import binascii 8 | import requests 9 | import time 10 | import json 11 | import datetime 12 | import codecs 13 | import re 14 | from collections import defaultdict 15 | 16 | MAX_INT = 999999 17 | 18 | 19 | class Weibo(): 20 | def __init__(self, user, passwd): 21 | self.username = user 22 | self.password = passwd 23 | 24 | self.get_login_params_url = 'https://login.sina.com.cn/sso/prelogin.php' 25 | self.get_login_params_headers = { 26 | 'Host': 'login.sina.com.cn', 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 28 | 'Accept': '*/*', 29 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 30 | 'Accept-Encoding': 'gzip, deflate, br', 31 | 'Referer': 'https://weibo.com/login.php', 32 | 'Connection': 'keep-alive', 33 | } 34 | self.get_login_params_data = { 35 | 'entry': 'weibo', 36 | 'callback': 'sinaSSOController.preloginCallBack', 37 | 'su': '', 38 | 'rsakt': 'mod', 39 | 'client': 'ssologin.js(v1.4.19)', 40 | '_': '1510196975574', # millisecond timestamp 41 | } 42 | 43 | self.login_params = { 44 | 'entry': 'weibo', 45 | 'gateway': '1', 46 | 'from': '', 47 | 'savestate': '7', 48 | 'qrcode_flag': 'false', 49 | 'useticket': '1', 50 | 'pagerefer': '', 51 | 'vsnf': '1', 52 | # 'su': 'MTUxMTk5MDQ5NzElNDBzaW5hLmNu', 53 | 'service': 'miniblog', 54 | # 'servertime': '1510195867', 55 | # 'nonce': 'P6ZMJ7', 56 | 'pwencode': 'rsa2', 57 | # 'rsakv': '1330428213', 58 | # 'sp': '', 59 | 'sr': '1920*1080', 60 | 'encoding': 'UTF-8', 61 | 'prelt': '210', 62 | 'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack', 63 | 'returntype': 'META', 64 | } 65 | self.login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)' 66 | self.login_headers = { 67 | 'Host': 'login.sina.com.cn', 68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 69 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 70 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 71 | 'Accept-Encoding': 'gzip, deflate, br', 72 | 'Referer': 'https://weibo.com/login.php', 73 | 'Connection': 'keep-alive', 74 | 'Upgrade-Insecure-Requests': '1' 75 | } 76 | 77 | self.search_headers = { 78 | 'Host': 'm.weibo.cn', 79 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 80 | 'Accept': 'application/json, text/plain, */*', 81 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 82 | 'Accept-Encoding': 'gzip, deflate, br', 83 | 'X-Requested-With': 'XMLHttpRequest', 84 | 'Referer': '', 85 | } 86 | self.search_referer = 'https://m.weibo.cn/p/100103type%3D2%26q%3D{keyword}?type=wb&queryVal={keyword}' \ 87 | '&featurecode=20000320&luicode=10000011&lfid=106003type%3D1&title={keyword}' 88 | self.search_url = 'https://m.weibo.cn/api/container/getIndex' 89 | self.search_data = { 90 | 'type': 'wb', 91 | 'queryVal': '{keyword}', 92 | 'featurecode': '20000320', 93 | 'luicode': '10000011', 94 | 'lfid': '106003type=1', 95 | 'title': '{keyword}', 96 | 'containerid': '100103type=2&q={keyword}', 97 | } 98 | 99 | self.comment_headers = { 100 | 'Host': 'm.weibo.cn', 101 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 102 | 'Accept': 'application/json, text/plain, */*', 103 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 104 | 'Accept-Encoding': 'gzip, deflate, br', 105 | 'X-Requested-With': 'XMLHttpRequest', 106 | # 'Referer': 'https://m.weibo.cn/status/4172328575544621', 107 | 'Connection': 'keep-alive', 108 | } 109 | self.comment_referer = 'https://m.weibo.cn/status/{weibo_id}' 110 | self.comment_url = 'https://m.weibo.cn/api/comments/show' 111 | self.comment_data = { 112 | 'id': '{id}', 113 | 'page': '{page}', 114 | } 115 | 116 | self.session = requests.session() 117 | self.servertime = None 118 | self.nonce = None 119 | self.pubkey = None 120 | self.rsakv = None 121 | self.keyword = None 122 | self.max_try = 5 123 | self.weibo_ids = [] 124 | self.csv_seq = '|' 125 | 126 | def get_b64_username(self): 127 | username = urllib.quote(self.username) 128 | username = base64.encodestring(username)[:-1] 129 | return username 130 | 131 | def get_rsa_password(self, pubkey, nonce, server_time): 132 | # var RSAKey = new sinaSSOEncoder.RSAKey(); 133 | # RSAKey.setPublic(me.rsaPubkey, "10001"); 134 | # password = RSAKey.encrypt([me.servertime, me.nonce].join("\t") + "\n" + password) 135 | rsaPublickey = int(pubkey, 16) 136 | key = rsa.PublicKey(rsaPublickey, int("10001", 16)) # build the public key 137 | message = str(server_time) + '\t' + str(nonce) + '\n' + str(self.password) # assemble the plaintext, mirroring Sina's login js above 138 | passwd = rsa.encrypt(message, key) # encrypt 139 | passwd = binascii.b2a_hex(passwd) # hex-encode the ciphertext 140 | return passwd 141 | 142 | def get_login_params(self): 143 | self.get_login_params_data['_'] = int(1000 * time.time()) 144 | resp = self.session.get(self.get_login_params_url, params=self.get_login_params_data, 145 | headers=self.get_login_params_headers) 146 | raw_data = resp.content 147 | raw_data = raw_data[len('sinaSSOController.preloginCallBack('):-1] 148 | json_data = json.loads(raw_data) 149 | return json_data['servertime'], json_data['nonce'], json_data['pubkey'], json_data['rsakv'] 150 | 151 | def set_login_params(self, servertime, nonce, pubkey, rsakv): 152 | self.servertime, self.nonce, self.pubkey, self.rsakv = servertime, nonce, pubkey, rsakv 153 | 154 | def login(self): 155 | user = 
self.get_b64_username() 156 | passwd = self.get_rsa_password(self.pubkey, self.nonce, self.servertime) 157 | self.login_params['su'] = user 158 | self.login_params['servertime'] = self.servertime 159 | self.login_params['nonce'] = self.nonce 160 | self.login_params['rsakv'] = self.rsakv 161 | self.login_params['sp'] = passwd 162 | 163 | resp = self.session.post(self.login_url, data=self.login_params, headers=self.login_headers) 164 | if 'retcode%3D0' in resp.content: 165 | print 'login success' 166 | return True 167 | print 'login fail' 168 | print resp.content 169 | return False 170 | 171 | def set_search_keyword(self, keyword): 172 | self.keyword = keyword 173 | self.search_headers['Referer'] = self.search_referer.format(keyword=urllib.quote(self.keyword)) 174 | self.search_data['queryVal'] = self.search_data['queryVal'].format(keyword=self.keyword) 175 | self.search_data['title'] = self.search_data['title'].format(keyword=self.keyword) 176 | self.search_data['containerid'] = self.search_data['containerid'].format(keyword=self.keyword) 177 | 178 | def parse_weibo_response(self, json_data): 179 | mids = [] 180 | if json_data['cards']: 181 | cards = json_data['cards'][0]['card_group'] 182 | for card in cards: 183 | mblog = card['mblog'] 184 | mid = mblog['mid'] # weibo id 185 | comments_count = mblog['comments_count'] # number of comments 186 | if comments_count: 187 | mids.append(mid) 188 | # text = mblog['text'] # weibo text, in html form 189 | # user = mblog['user'] # poster 190 | # user_id = user['id'] 191 | # user_name = user['screen_name'] 192 | # user_desc = user['description'] 193 | return mids 194 | 195 | def set_max_try(self, n): 196 | self.max_try = n 197 | 198 | def get_weibo_item_ids(self): 199 | mids = [] 200 | is_end = False 201 | page_try = defaultdict(lambda: 0) 202 | page = 1 203 | while not is_end: 204 | self.search_data['page'] = str(page) 205 | page_try[page] += 1 206 | if page_try[page] > self.max_try: 207 | print 'page %d is more than max tries %d' % (page, self.max_try) 208 | page += 1 209 | 210 | resp = requests.get(self.search_url, params=self.search_data, headers=self.search_headers) 211 | json_data = {'ok': 0} 212 | try: 213 | json_data = resp.json() 214 | except ValueError: 215 | print resp.content 216 | if json_data['ok']: 217 | print 'start page %d' % page 218 | mids.extend(self.parse_weibo_response(json_data)) # collect the ids of all weibos that have comments 219 | if not json_data['cardlistInfo']['page']: 220 | is_end = True 221 | else: 222 | page += 1 223 | else: 224 | print 'page %d not ok' % page 225 | print 'weibo search finished, %d ids collected' % len(mids) 226 | return mids 227 | 228 | def set_weibo_ids(self, ids): 229 | self.weibo_ids = ids 230 | 231 | def html_filter(self, html_text): 232 | pattern = re.compile(r'<[^>]+>', re.S) 233 | no_html_text = pattern.sub('', html_text) 234 | return no_html_text 235 | 236 | def parse_comment_data(self, json_data): 237 | comments = [] 238 | if 'data' in json_data and json_data['data']: 239 | for comment in json_data['data']: 240 | comment_id = unicode(comment['id']) 241 | user_id = unicode(comment['user']['id']) 242 | user_name = unicode(comment['user']['screen_name']) 243 | comment_content = unicode(comment['text']) 244 | comment_content = self.html_filter(comment_content) 245 | comments.append((comment_id, user_id, user_name, comment_content)) 246 | return comments 247 | 248 | def curl_comments(self, filename=None): 249 | # create the output file and write the csv header 250 | if not filename: 251 | filename = 'weibo-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M')) 252 | with codecs.open(filename, 
'a', 'utf-8') as f: 253 | f.write(self.csv_seq.join((u'评论id', u'用户id', u'用户名', u'评论')) + '\n') 254 | 255 | current_id_pos = 1 256 | max_id_pos = len(self.weibo_ids) 257 | for id_ in self.weibo_ids: 258 | print '-*- start weibo page %d/%d -*-' % (current_id_pos, max_id_pos) 259 | current_id_pos += 1 260 | 261 | # initialize the per-page iteration state 262 | page = 1 263 | page_try = defaultdict(lambda: 0) 264 | comment_headers = self.comment_headers.copy() 265 | comment_headers['Referer'] = self.comment_referer.format(weibo_id=id_) 266 | comment_data = self.comment_data.copy() 267 | comment_data['id'] = id_ 268 | max_page = MAX_INT 269 | # crawl the comment pages 270 | while page <= max_page: 271 | # limit retries; move on once the max number of tries is exceeded 272 | page_try[page] += 1 273 | if page_try[page] > self.max_try: 274 | print 'comment page %d/%d is more than max tries %d' % (page, max_page, self.max_try) 275 | page += 1 276 | # stop once past the last page 277 | if page > max_page: 278 | print 'page %d is more than max_page %d' % (page, max_page) 279 | break 280 | # stop if the total page count could not be determined 281 | if page > 1 and max_page == MAX_INT: 282 | print 'No next page %d' % page 283 | break 284 | 285 | # fetch the comments 286 | comment_data['page'] = str(page) 287 | resp = self.session.get(self.comment_url, params=comment_data, headers=comment_headers) 288 | json_data = {'ok': 0} 289 | try: 290 | json_data = resp.json() 291 | except ValueError: 292 | # a malformed response means the api cut us off, so stop crawling this weibo 293 | break 294 | if json_data['ok']: 295 | print 'start comment page %d/%d' % (page, max_page) 296 | max_page = min(json_data['max'], max_page) # total number of comment pages 297 | comments = self.parse_comment_data(json_data) # parse the comments on the current page 298 | with codecs.open(filename, 'a', 'utf-8') as f: 299 | for comment in comments: 300 | f.write(self.csv_seq.join(comment) + '\n') 301 | 302 | # advance the page counter 303 | page += 1 304 | else: 305 | print 'comment page %d/%d not ok' % (page, max_page) 306 | 307 | 308 | if __name__ == '__main__': 309 | weibo = Weibo('your_weibo_username', 'your_weibo_password') 310 | servertime, nonce, pubkey, rsakv = weibo.get_login_params() 311 | weibo.set_login_params(servertime, nonce, pubkey, rsakv) 312 | if weibo.login(): 313 | weibo.set_search_keyword('冯提莫') 314 | ids = weibo.get_weibo_item_ids() 315 | weibo.set_weibo_ids(ids) 316 | weibo.curl_comments() 317 | else: 318 | print 'Make sure that your Weibo username and password are right' 319 | -------------------------------------------------------------------------------- /WeiboCommentSpider/weibo_comment_result_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "ok": 1, 3 | "msg": "\u6570\u636e\u83b7\u53d6\u6210\u529f", 4 | "data": [ 5 | { 6 | "id": 4172544283687415, 7 | "created_at": "1\u5206\u949f\u524d", 8 | "source": "\u9b45\u65cf PRO 5", 9 | "user": { 10 | "id": 2136636343, 11 | "screen_name": "\u7af9\u897fC", 12 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.19.233.233.180\/7f5a7bb7jw1ealqnreu9oj206j08bmxf.jpg", 13 | "verified": false, 14 | "verified_type": -1, 15 | "mbtype": 0, 16 | "profile_url": "https:\/\/m.weibo.cn\/u\/2136636343?uid=2136636343", 17 | "remark": "" 18 | }, 19 | "text": "\u4e0d\u662f\u840c\u5c0f\u561b\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>", 20 | "like_counts": 0, 21 | "liked": false 22 | }, 23 | { 24 | "id": 4172544233718438, 25 | "created_at": "1\u5206\u949f\u524d", 26 | "source": "\u5c0f\u7c73Max2 \u5927\u5c4f\u5927\u7535\u91cf", 27 | "user": { 28 | "id": 5322013701, 29 | "screen_name": "JANUARYFEBRUARYMARCHAPRIL", 30 | "profile_image_url": 
"https:\/\/tvax2.sinaimg.cn\/crop.0.0.1002.1002.180\/005OaCSFly8flaxg6qj56j30ru0ru7ae.jpg", 31 | "verified": false, 32 | "verified_type": -1, 33 | "mbtype": 2, 34 | "profile_url": "https:\/\/m.weibo.cn\/u\/5322013701?uid=5322013701", 35 | "remark": "" 36 | }, 37 | "text": "\u56de\u590d@\u61d2\u7c73\u7c738711<\/a>:\u770b\u540e\u8fb9\u5199\u7684\u5b57\u513f\"[\u7b11cry]\"<\/span>", 38 | "reply_id": 4172543659142497, 39 | "reply_text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u5976\u7238\u8bf4\u662f\u840c\u5c0f", 40 | "like_counts": 0, 41 | "liked": false 42 | }, 43 | { 44 | "id": 4172544187200590, 45 | "created_at": "1\u5206\u949f\u524d", 46 | "source": "iPhone\u5ba2\u6237\u7aef", 47 | "user": { 48 | "id": 1832833387, 49 | "screen_name": "sevenlife", 50 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.512.512.180\/6d3ed16bly8fe87n2w0x7j20e80e8mx8.jpg", 51 | "verified": false, 52 | "verified_type": -1, 53 | "mbtype": 0, 54 | "profile_url": "https:\/\/m.weibo.cn\/u\/1832833387?uid=1832833387", 55 | "remark": "" 56 | }, 57 | "text": "\u8fd9\u7167\u7247\u662f\u4e0d\u662f\u6709\u70b9\u50cf\u7834\u4ea7\u59d0\u59b9\u91cc\u7684\u90a3\u8c01", 58 | "like_counts": 0, 59 | "liked": false 60 | }, 61 | { 62 | "id": 4172544179133133, 63 | "created_at": "1\u5206\u949f\u524d", 64 | "source": "\u5c0f\u7c735X \u62cd\u4eba\u66f4\u7f8e", 65 | "user": { 66 | "id": 3225859821, 67 | "screen_name": "\u5403\u751c\u7684\u767d\u65e5\u68a6", 68 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.664.664.180\/c046b6edjw8ezwhq0q53bj20ig0igq3y.jpg", 69 | "verified": false, 70 | "verified_type": -1, 71 | "mbtype": 11, 72 | "profile_url": "https:\/\/m.weibo.cn\/u\/3225859821?uid=3225859821", 73 | "remark": "" 74 | }, 75 | "text": "\u6709\u4e2a\u5c0f\u59d1\u5a18\u7167\u5230\u95ed\u773c\u775b\u4e86\uff0c\u540c\u60c5", 76 | "like_counts": 0, 77 | "liked": false 78 | }, 79 | { 80 | "id": 4172543885430014, 81 | "created_at": "2\u5206\u949f\u524d", 82 | "source": "\u5c0f\u7c73Max", 83 | "user": { 84 | "id": 6270899000, 85 | "screen_name": "\u601d\u5ff5Jill", 86 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.0.0.1002.1002.180\/006Qo3igly8fgceblxh2hj30ru0rugnt.jpg", 87 | "verified": false, 88 | "verified_type": -1, 89 | "mbtype": 11, 90 | "profile_url": "https:\/\/m.weibo.cn\/u\/6270899000?uid=6270899000", 91 | "remark": "" 92 | }, 93 | "text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u54c8\u54c8\uff0c\u559c\u95fb\u4e50\u89c1\u7ffb\u8f66\u73b0\u573a\"[\u7b11\u800c\u4e0d\u8bed]\"<\/span>", 94 | "reply_id": 4172543135276210, 95 | "reply_text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 96 | "like_counts": 0, 97 | "liked": false 98 | }, 99 | { 100 | "id": 4172543823262436, 101 | "created_at": "3\u5206\u949f\u524d", 102 | "source": "Android", 103 | "user": { 104 | "id": 3099343157, 105 | "screen_name": "\u6770\u6770ber", 106 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.960.960.180\/b8bc3935ly8fht7jimi3cj20qo0qojtw.jpg", 107 | "verified": false, 108 | "verified_type": -1, 109 | "mbtype": 2, 110 | "profile_url": "https:\/\/m.weibo.cn\/u\/3099343157?uid=3099343157", 111 | "remark": "" 112 | }, 113 | "text": "\u5b57\u6709\u70b9\u4f24\u5440", 114 | "like_counts": 0, 115 | "liked": false 116 | }, 117 | { 118 | "id": 4172543659142497, 119 | "created_at": "3\u5206\u949f\u524d", 120 | "source": 
"\u5356\u840c\u5170\u6362\u7684Android", 121 | "user": { 122 | "id": 3274546055, 123 | "screen_name": "\u61d2\u7c73\u7c738711", 124 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.996.996.180\/c32d9b87ly8feg3rclq14j20ro0ron0t.jpg", 125 | "verified": false, 126 | "verified_type": -1, 127 | "mbtype": 11, 128 | "profile_url": "https:\/\/m.weibo.cn\/u\/3274546055?uid=3274546055", 129 | "remark": "" 130 | }, 131 | "text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u5976\u7238\u8bf4\u662f\u840c\u5c0f", 132 | "reply_id": 4172543135276210, 133 | "reply_text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 134 | "like_counts": 0, 135 | "liked": false 136 | }, 137 | { 138 | "id": 4172543521028345, 139 | "created_at": "4\u5206\u949f\u524d", 140 | "source": "OPPO R9", 141 | "user": { 142 | "id": 2605943347, 143 | "screen_name": "\u732b\u4e0e\u69b4\u83b2\u7530baocl", 144 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.996.996.180\/9b538a33ly8fl7mj42xsqj20ro0rodi7.jpg", 145 | "verified": false, 146 | "verified_type": -1, 147 | "mbtype": 0, 148 | "profile_url": "https:\/\/m.weibo.cn\/u\/2605943347?uid=2605943347", 149 | "remark": "" 150 | }, 151 | "text": "\u56fe\u4e09\u6211\u60f3\u8981\u539f\u56fe\"[\u55b5\u55b5]\"<\/span>", 152 | "like_counts": 0, 153 | "liked": false 154 | }, 155 | { 156 | "id": 4172543504466751, 157 | "created_at": "4\u5206\u949f\u524d", 158 | "source": "\u5fae\u535a weibo.com", 159 | "user": { 160 | "id": 2393112124, 161 | "screen_name": "Iamnotafraid", 162 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.0.0.180.180.180\/8ea3fe3cjw1e8qgp5bmzyj2050050aa8.jpg", 163 | "verified": false, 164 | "verified_type": -1, 165 | "mbtype": 0, 166 | "profile_url": "https:\/\/m.weibo.cn\/u\/2393112124?uid=2393112124", 167 | "remark": "" 168 | }, 169 | "text": "\u8fd9\u4e9b\u5b69\u5b50 \u90fd\u4e0d\u7528\u53bb\u4e0a\u5b66\u5417", 170 | "like_counts": 0, 171 | "liked": false 172 | }, 173 | { 174 | "id": 4172543478523580, 175 | "created_at": "4\u5206\u949f\u524d", 176 | "source": "iPhone\u5ba2\u6237\u7aef", 177 | "user": { 178 | "id": 3920727637, 179 | "screen_name": "\u6211\u53d6\u4ec0\u4e48\u540d\u5b57\u5173\u4f60\u5c41\u4e8b", 180 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.0.7.497.497.180\/e9b18e55ly8fj2a06llbvj20dt0e8wf1.jpg", 181 | "verified": false, 182 | "verified_type": -1, 183 | "mbtype": 2, 184 | "profile_url": "https:\/\/m.weibo.cn\/u\/3920727637?uid=3920727637", 185 | "remark": "" 186 | }, 187 | "text": "\u7ec8\u4e8e\u7b11\u4e86\uff0c\u8fd8\u662f\u6eda\u6eda\u5389\u5bb3\ud83d\udc3c", 188 | "like_counts": 0, 189 | "liked": false 190 | } 191 | ], 192 | "total_number": 27, 193 | "max": 3, 194 | "hot_data": [ 195 | { 196 | "id": 4172538495654634, 197 | "created_at": "24\u5206\u949f\u524d", 198 | "source": "\u5fae\u535a weibo.com", 199 | "user": { 200 | "id": 5838706073, 201 | "screen_name": "\u978b\u5382-\u6279\u53d1\u603b\u5e97", 202 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.204.174.600.600.180\/006n8CaBly8flctk2fzfbj30u00qoq6f.jpg", 203 | "verified": false, 204 | "verified_type": -1, 205 | "mbtype": 11, 206 | "profile_url": "https:\/\/m.weibo.cn\/u\/5838706073?uid=5838706073", 207 | "remark": "" 208 | }, 209 | "text": "\u4ed6\u4eec\u4e00\u5bb6\u7b7e\u540d\u90fd\u662f\u540c\u6b3e\u5b57\u4f53\u554a\"[\u7b11cry]\"<\/span>", 210 | "like_counts": 26, 211 | "liked": 
false 212 | }, 213 | { 214 | "id": 4172543135276210, 215 | "created_at": "5\u5206\u949f\u524d", 216 | "source": "\u5fae\u535a weibo.com", 217 | "user": { 218 | "id": 3222817584, 219 | "screen_name": "iPanda\u718a\u732b\u9891\u9053", 220 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.199.199.180\/c0184b30ly1fib5h4nawmj205k05kaaj.jpg", 221 | "verified": true, 222 | "verified_type": 3, 223 | "verified_type_ext": 0, 224 | "mbtype": 12, 225 | "profile_url": "https:\/\/m.weibo.cn\/u\/3222817584?uid=3222817584", 226 | "remark": "" 227 | }, 228 | "text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 229 | "like_counts": 5, 230 | "liked": false 231 | }, 232 | { 233 | "id": 4172544283687415, 234 | "created_at": "1\u5206\u949f\u524d", 235 | "source": "\u9b45\u65cf PRO 5", 236 | "user": { 237 | "id": 2136636343, 238 | "screen_name": "\u7af9\u897fC", 239 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.19.233.233.180\/7f5a7bb7jw1ealqnreu9oj206j08bmxf.jpg", 240 | "verified": false, 241 | "verified_type": -1, 242 | "mbtype": 0, 243 | "profile_url": "https:\/\/m.weibo.cn\/u\/2136636343?uid=2136636343", 244 | "remark": "" 245 | }, 246 | "text": "\u4e0d\u662f\u840c\u5c0f\u561b\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>", 247 | "like_counts": 0, 248 | "liked": false 249 | }, 250 | { 251 | "id": 4172543521028345, 252 | "created_at": "4\u5206\u949f\u524d", 253 | "source": "OPPO R9", 254 | "user": { 255 | "id": 2605943347, 256 | "screen_name": "\u732b\u4e0e\u69b4\u83b2\u7530baocl", 257 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.996.996.180\/9b538a33ly8fl7mj42xsqj20ro0rodi7.jpg", 258 | "verified": false, 259 | "verified_type": -1, 260 | "mbtype": 0, 261 | "profile_url": "https:\/\/m.weibo.cn\/u\/2605943347?uid=2605943347", 262 | "remark": "" 263 | }, 264 | "text": "\u56fe\u4e09\u6211\u60f3\u8981\u539f\u56fe\"[\u55b5\u55b5]\"<\/span>", 265 | "like_counts": 0, 266 | "liked": false 267 | }, 268 | { 269 | "id": 4172539422399038, 270 | "created_at": "20\u5206\u949f\u524d", 271 | "source": "Weibo.intl", 272 | "user": { 273 | "id": 5510916469, 274 | "screen_name": "\u738b\u98ce\u9c7c", 275 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.736.736.180\/0060Xf7vly8fl4vru5lvmj30kg0kgab1.jpg", 276 | "verified": false, 277 | "verified_type": -1, 278 | "mbtype": 0, 279 | "profile_url": "https:\/\/m.weibo.cn\/u\/5510916469?uid=5510916469", 280 | "remark": "" 281 | }, 282 | "text": "\u5c0f\u5b69\u513f\u90fd\u4e0d\u7a7f\u68c9\u5927\u8863\u4e86\u3002\u7537\u751f\u7684\u5927\u80cc\u5934\u4e5f\u662f\u9ebb\u70e6\u53d1\u578b\u5e08\u4e86\u3002", 283 | "like_counts": 4, 284 | "liked": false 285 | }, 286 | { 287 | "id": 4172540798335754, 288 | "created_at": "15\u5206\u949f\u524d", 289 | "source": "\u5fae\u535a weibo.com", 290 | "user": { 291 | "id": 5581817548, 292 | "screen_name": "M0DA1YE", 293 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.173.0.639.639.180\/0065KJJily8ff7zxx617rj30sh0hrt9n.jpg", 294 | "verified": false, 295 | "verified_type": -1, 296 | "mbtype": 12, 297 | "profile_url": "https:\/\/m.weibo.cn\/u\/5581817548?uid=5581817548", 298 | "remark": "" 299 | }, 300 | "text": "\u7edf\u4e00\u670d\u88c5\uff1f\uff1f\uff1f", 301 | "like_counts": 2, 302 | "liked": false 303 | } 304 | ], 305 | "hot_total_number": 6 306 | } 307 | 
-------------------------------------------------------------------------------- /WeiboCommentSpider/weibo_search_result_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "cardlistInfo": { 3 | "v_p": "42", 4 | "containerid": "100103type=2&q=\u7279\u6717\u666e", 5 | "title_top": "\u7279\u6717\u666e", 6 | "total": 1000, 7 | "show_style": 1, 8 | "starttime": 1510279670, 9 | "can_shared": 0, 10 | "cardlist_menus": [], 11 | "cardlist_head_cards": [], 12 | "toolbar_menus": [], 13 | "show_read_progress": null, 14 | "show_read_progress_stop": null, 15 | "page_size": 20, 16 | "page": 2 17 | }, 18 | "cards": [ 19 | { 20 | "card_type": 11, 21 | "show_type": 1, 22 | "card_group": [ 23 | { 24 | "card_type": 9, 25 | "card_type_name": "\u5fae\u535a", 26 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 27 | "actionlog": { 28 | "act_code": 554, 29 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 30 | "luicode": "", 31 | "uicode": "", 32 | "fid": "100103type=2&q=\u7279\u6717\u666e" 33 | }, 34 | "display_arrow": 0, 35 | "show_type": 1, 36 | "mblog": { 37 | "created_at": "\u521a\u521a", 38 | "id": "4172477577867420", 39 | "mid": "4172477577867420", 40 | "idstr": "4172477577867420", 41 | "text": "\u3010\u7279\u6717\u666e\u4eca\u5929\u8bf4\u8bdd\u633a\u9760\u8c31[\u60a0\u95f2][\u5472\u7259]\u3011\u4e3b\u5e2d\u5148\u751f\uff1a\u975e\u5e38\u611f\u8c22\uff0c\u548c\u60a8\u5728\u4e00\u8d77\u5f88\u8363\u5e78\u3002\u4e2d\u7f8e\u5173\u7cfb\u662f\u975e\u5e38\u91cd\u8981\u7684\u8bdd\u9898\uff0c\u6d89\u53ca\u6211\u4eec\u53cc\u65b9\u4e5f\u5305\u62ec\u5176\u4ed6\u7684\u4e00\u4e9b\u56fd\u5bb6\u3002\u6211\u4eec\u76f8\u4fe1\uff0c\u4e2d\u7f8e\u6709\u80fd\u529b\u5728\u4eca\u540e\u89e3\u51b3\u4e16\u754c\u95ee\u9898\u3002\u6628\u5929\u665a\u4e0a\u7684\u4f1a\u6664\u662f\u975e\u5e38\u68d2\u7684\uff0c\u6211\u4eec\u7684\u665a\u9910\u65f6\u95f4\u8d85\u51fa\u4e86\u9884\u671f\u3002\u672c\u6765\u5b89\u6392\u4e8625\u5206\u949f\u7684\u665a\u9910\uff0c\u53ef\u4f60\u8fd9\u4e48\u53cb\u597d\uff0c\u665a\u5bb4\u6301\u7eed\u4e86 \u200b...\u5168\u6587<\/a>", 42 | "textLength": 1241, 43 | "source": "HUAWEI Mate 8", 44 | "favorited": false, 45 | "is_paid": false, 46 | "mblog_vip_type": 0, 47 | "user": { 48 | "id": 1248005561, 49 | "screen_name": "\u65b0\u56db\u541b\u5fae\u8584\u529b\u91cf", 50 | "profile_image_url": "https:\/\/tva1.sinaimg.cn\/crop.0.93.394.394.180\/4a630db9jw8ectj89qv12j20ay0g4ju7.jpg", 51 | "profile_url": "https:\/\/m.weibo.cn\/u\/1248005561?uid=1248005561&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 52 | "statuses_count": 49623, 53 | "verified": false, 54 | "verified_type": 220, 55 | "close_blue_v": false, 56 | "description": "\u6e29\u5ba4\u82b1\u6735\u53ea\u662f\u89c2\u8d4f\u690d\u7269\uff0c\u680b\u6881\u4e4b\u624d\u9700\u8981\u98ce\u5439\u96e8\u6c90\u3002\u73b0\u5b9e\u4e0d\u53ea\u6709\u7b11\u8fd8\u5e94\u6709\u54ed\uff0c\u7ecf\u5386\u548c\u632b\u6298\u662f\u5b9d\u8d35\u8d22\u5bcc\u3002", 57 | "gender": "m", 58 | "mbtype": 0, 59 | "urank": 40, 60 | "mbrank": 0, 61 | "follow_me": false, 62 | "following": false, 63 | "followers_count": 8031, 64 | "follow_count": 2230, 65 | "cover_image_phone": "https:\/\/tva3.sinaimg.cn\/crop.0.0.640.640.640\/6ce2240djw1e9odcin216j20hs0hstd8.jpg", 66 | "avatar_hd": "https:\/\/ww1.sinaimg.cn\/orj480\/4a630db9jw8ectj89qv12j20ay0g4ju7.jpg" 67 | }, 68 | "reposts_count": 0, 69 | "comments_count": 0, 70 | 
"attitudes_count": 0, 71 | "pending_approval_count": 0, 72 | "isLongText": true, 73 | "visible": { 74 | "type": 0, 75 | "list_id": 0 76 | }, 77 | "rid": "0_0_0_2676184252325337115", 78 | "mlevelSource": "monitor", 79 | "more_info_type": 0, 80 | "status": 0, 81 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 82 | "page_info": { 83 | "page_pic": { 84 | "url": "https:\/\/ww4.sinaimg.cn\/large\/005BvWaMjw1eubrf1f94jj3050050mxb.jpg" 85 | }, 86 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=1001018008636068100000002&title=%2525E5%252595%252586%2525E5%25259C%252588&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 87 | "page_title": "\u8d35\u6eaa\u00b7\u9e70\u6f6d", 88 | "content1": "", 89 | "content2": "", 90 | "type": "webpage" 91 | }, 92 | "bid": "Fuptjx0FS" 93 | }, 94 | "scheme": "https:\/\/m.weibo.cn\/status\/Fuptjx0FS?mblogid=Fuptjx0FS&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 95 | }, 96 | { 97 | "card_type": 9, 98 | "card_type_name": "\u5fae\u535a", 99 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 100 | "actionlog": { 101 | "act_code": 554, 102 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 103 | "luicode": "", 104 | "uicode": "", 105 | "fid": "100103type=2&q=\u7279\u6717\u666e" 106 | }, 107 | "display_arrow": 0, 108 | "show_type": 1, 109 | "mblog": { 110 | "created_at": "1\u5206\u949f\u524d", 111 | "id": "4172477380932600", 112 | "mid": "4172477380932600", 113 | "idstr": "4172477380932600", 114 | "text": "#\u7279\u6717\u666e\u8bbf\u534e#<\/a>#\u7279\u6717\u666e\u6765\u4e86#<\/a>\u7279\u6717\u666e\u8fd9\u4e00\u6ce2\uff0c\u53cc\u65b9\u6e05\u7a7a\u4e862535\u4ebf\u7f8e\u5143\u7684\u8d2d\u7269\u8f66\u3002\u6295\u8d44\u7c7b\uff1a\u897f\u5f17\u5409\u5c3c\u4e9a\u9875\u5ca9\u6c14\u5f00\u53d1\uff1a837\u4ebf\u7f8e\u5143\u9655\u897f\u6986\u6797\u7164\u6db2\u5316\u9879\u76ee\uff1a117\u4ebf\u7f8e\u5143\u963f\u62c9\u65af\u52a0\u6db2\u5316\u5929\u7136\u6c14\u5f00\u53d1\uff1a430\u4ebf\u7f8e\u5143\u6d88\u8d39\u7c7b\uff1a\u8d2d\u4e70\u7f8e\u56fd\u6db2\u5316\u5929\u7136\u6c14\uff1a110\u4ebf\u7f8e\u5143\u8d2d\u4e70\u7f8e\u56fd\u4e59\u70f7\uff1a260\u4ebf\u7f8e\u5143\u8d2d\u4e70\u6ce2\u97f3\u98de\u673a\uff1a370\u4ebf\u7f8e\u5143\u8d2d\u4e70\u5ba2\u673a\u53d1\u52a8\u673a\uff1a \u200b...\u5168\u6587<\/a>", 115 | "textLength": 487, 116 | "source": "Weibo.intl", 117 | "favorited": false, 118 | "thumbnail_pic": "http:\/\/wx1.sinaimg.cn\/thumbnail\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 119 | "bmiddle_pic": "http:\/\/wx1.sinaimg.cn\/bmiddle\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 120 | "original_pic": "http:\/\/wx1.sinaimg.cn\/large\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 121 | "is_paid": false, 122 | "mblog_vip_type": 0, 123 | "user": { 124 | "id": 5323903225, 125 | "screen_name": "\u9752\u6850K", 126 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.0.0.480.480.180\/005OiyqRly8fk8xegluatj30dc0dcgnv.jpg", 127 | "profile_url": "https:\/\/m.weibo.cn\/u\/5323903225?uid=5323903225&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 128 | "statuses_count": 13, 129 | "verified": false, 130 | "verified_type": -1, 131 | "close_blue_v": false, 132 | "description": "We are all in the gutter , but some of us are looking at the stars.", 133 | "gender": "m", 134 | "mbtype": 2, 135 | "urank": 14, 136 | "mbrank": 1, 137 | 
"follow_me": false, 138 | "following": false, 139 | "followers_count": 109, 140 | "follow_count": 168, 141 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/6cf8d7ebjw1ehfr4xa8psj20hs0hsgpg.jpg", 142 | "avatar_hd": "https:\/\/wx3.sinaimg.cn\/orj480\/005OiyqRly8fk8xegluatj30dc0dcgnv.jpg" 143 | }, 144 | "reposts_count": 0, 145 | "comments_count": 0, 146 | "attitudes_count": 0, 147 | "pending_approval_count": 0, 148 | "isLongText": true, 149 | "visible": { 150 | "type": 0, 151 | "list_id": 0 152 | }, 153 | "rid": "1_0_0_2676184252325337115", 154 | "more_info_type": 0, 155 | "status": 0, 156 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 157 | "page_info": { 158 | "page_pic": { 159 | "url": "https:\/\/wx2.sinaimg.cn\/thumbnail\/654b47daly1flawyuu3vgj2050050mxl.jpg" 160 | }, 161 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=100808e025355b5f4f2f6264fa66697f7c3139&extparam=%E7%89%B9%E6%9C%97%E6%99%AE%E8%AE%BF%E5%8D%8E&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 162 | "page_title": "#\u7279\u6717\u666e\u8bbf\u534e#", 163 | "content1": "11\u67088\u65e5\u81f310\u65e5\uff0c\u7b2c\u4e00\u65f6\u95f4\u64ad\u62a5", 164 | "content2": "462\u4eba\u5173\u6ce8", 165 | "type": "topic" 166 | }, 167 | "bid": "Fupt03UBW", 168 | "pics": [ 169 | { 170 | "pid": "005OiyqRly1flcripbkpjj30go0g3js6", 171 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 172 | "size": "orj360", 173 | "geo": { 174 | "width": 279, 175 | "height": 270, 176 | "croped": false 177 | }, 178 | "large": { 179 | "size": "large", 180 | "url": "https:\/\/wx1.sinaimg.cn\/large\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 181 | "geo": { 182 | "width": "600", 183 | "height": "579", 184 | "croped": false 185 | } 186 | } 187 | } 188 | ] 189 | }, 190 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupt03UBW?mblogid=Fupt03UBW&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 191 | }, 192 | { 193 | "card_type": 9, 194 | "card_type_name": "\u5fae\u535a", 195 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 196 | "actionlog": { 197 | "act_code": 554, 198 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 199 | "luicode": "", 200 | "uicode": "", 201 | "fid": "100103type=2&q=\u7279\u6717\u666e" 202 | }, 203 | "display_arrow": 0, 204 | "show_type": 1, 205 | "mblog": { 206 | "created_at": "5\u5206\u949f\u524d", 207 | "id": "4172476239861040", 208 | "mid": "4172476239861040", 209 | "idstr": "4172476239861040", 210 | "text": "#\u7279\u6717\u666e\u7ed3\u675f\u8bbf\u534e#<\/a>\u53bbAPEC\uff0c#\u4e2d\u7f8e2535\u4ebf\u5927\u5355#<\/a>\u5305\u62ec\u54ea\u4e9b\uff1f1\u3001\u4e2d\u822a\u6750\u4e0e\u6ce2\u97f3\u534f\u8bae370\u4ebf2\u3001\u901a\u7528\u7535\u6c1435\u4ebf3\u3001\u9ad8\u901a120\u4ebf4\u3001\u4e2d\u6838\u96c6\u56e2\u4e0e\u7f8e\u56fd\u897f\u5c4b\u7535\u6c14\u516c\u53f8\u5546\u8ba8\u5408\u4f5c\u4e8b\u5b9c5\u3001\u4e2d\u6295\u516c\u53f8\u4e0e\u9ad8\u76db\u96c6\u56e2\u5408\u4f5c\u57fa\u91d1\u91d1\u989d50\u4ebf6\u3001\u6295\u8d44\u963f\u62c9\u65af\u52a0\u5dde\u5f00\u53d1\u6db2\u5316\u5929\u7136\u6c14430\u4ebf7\u3001\u8fdb\u53e31200\u4e07\u5428\u5927\u8c4650\u4ebf8\u3001\u897f\u5f17\u5dde\u6295\u8d44837\u4ebf9\u3001UOP2.2\u4ebf10 \u200b...\u5168\u6587<\/a>", 211 | "textLength": 778, 212 | "source": "\u5fae\u535a weibo.com", 213 | "favorited": false, 214 | "thumbnail_pic": 
"http:\/\/wx1.sinaimg.cn\/thumbnail\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 215 | "bmiddle_pic": "http:\/\/wx1.sinaimg.cn\/bmiddle\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 216 | "original_pic": "http:\/\/wx1.sinaimg.cn\/large\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 217 | "is_paid": false, 218 | "mblog_vip_type": 0, 219 | "user": { 220 | "id": 2034511685, 221 | "screen_name": "\u6c88\u9633\u73af\u7403\u6559\u80b2_", 222 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.11.12.372.372.180\/79442f45ly8fjovanslgdj20ap0ap3zi.jpg", 223 | "profile_url": "https:\/\/m.weibo.cn\/u\/2034511685?uid=2034511685&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 224 | "statuses_count": 1705, 225 | "verified": true, 226 | "verified_type": 2, 227 | "verified_type_ext": 0, 228 | "verified_reason": "\u6c88\u9633\u73af\u7403\u96c5\u601d\u57f9\u8bad\u5b66\u6821", 229 | "close_blue_v": false, 230 | "description": "\u73af\u7403\u6559\u80b2\u662f\u5168\u56fd\u6700\u5927\u7684\u96c5\u601d\u9ad8\u5206\u57f9\u8bad\u57fa\u5730\uff0c\u6210\u4e3a\u5168\u7403\u7b2c\u4e00\u5bb6\u8de8\u56fd\u6559\u80b2\u4e0a\u5e02\u4f01\u4e1a\u3002\u6bcf\u6708\u5f00\u8bbeVIP\u79c1\u4eba\u5b9a\u5236\u73ed\u30013\u4eba\uff0c6\u4eba\uff0c10\u4eba\u73ed\u3002\u54a8\u8be2\u7535\u8bdd\uff1a400 900 3013", 231 | "gender": "m", 232 | "mbtype": 0, 233 | "urank": 20, 234 | "mbrank": 0, 235 | "follow_me": false, 236 | "following": false, 237 | "followers_count": 1388, 238 | "follow_count": 417, 239 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 240 | "avatar_hd": "https:\/\/wx1.sinaimg.cn\/orj480\/79442f45ly8fjovanslgdj20ap0ap3zi.jpg" 241 | }, 242 | "reposts_count": 0, 243 | "comments_count": 0, 244 | "attitudes_count": 0, 245 | "pending_approval_count": 0, 246 | "isLongText": true, 247 | "visible": { 248 | "type": 0, 249 | "list_id": 0 250 | }, 251 | "rid": "3_0_0_2676184252325337115", 252 | "more_info_type": 0, 253 | "status": 0, 254 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 255 | "page_info": { 256 | "page_pic": { 257 | "url": "https:\/\/tva4.sinaimg.cn\/crop.0.0.2780.1566\/90eb2137ly1fl41rhctqsj225c17iwzv.jpg" 258 | }, 259 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=23137500007546610938550273&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 260 | "page_title": "\u3010\u65b0\u9c9c\u4e8b\u3011\u7279\u6717\u666e\u4e9a\u6d32\u884c", 261 | "content1": "", 262 | "content2": "", 263 | "type": "webpage" 264 | }, 265 | "bid": "Fupr9Fnj2", 266 | "pics": [ 267 | { 268 | "pid": "79442f45ly1flcrd3b7pxj20dw0afjrn", 269 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 270 | "size": "orj360", 271 | "geo": { 272 | "width": 360, 273 | "height": 270, 274 | "croped": false 275 | }, 276 | "large": { 277 | "size": "large", 278 | "url": "https:\/\/wx1.sinaimg.cn\/large\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 279 | "geo": { 280 | "width": "500", 281 | "height": "375", 282 | "croped": false 283 | } 284 | } 285 | } 286 | ] 287 | }, 288 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr9Fnj2?mblogid=Fupr9Fnj2&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 289 | }, 290 | { 291 | "card_type": 9, 292 | "card_type_name": "\u5fae\u535a", 293 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 294 | "actionlog": { 295 | 
"act_code": 554, 296 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 297 | "luicode": "", 298 | "uicode": "", 299 | "fid": "100103type=2&q=\u7279\u6717\u666e" 300 | }, 301 | "display_arrow": 0, 302 | "show_type": 1, 303 | "mblog": { 304 | "created_at": "5\u5206\u949f\u524d", 305 | "id": "4172476231452360", 306 | "mid": "4172476231452360", 307 | "idstr": "4172476231452360", 308 | "text": "\u5ddd\u666e \u7279\u6717\u666e \u90fd\u6210\u654f\u611f\u8bcd\u4e86\u3002 \u200b", 309 | "textLength": 26, 310 | "source": "\u5fae\u535a weibo.com", 311 | "favorited": false, 312 | "is_paid": false, 313 | "mblog_vip_type": 0, 314 | "user": { 315 | "id": 2299119915, 316 | "screen_name": "\u62e5\u62a4\u4f1f\u5927\u9886\u8896", 317 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.192.126.481.481.180\/8909c92bly8fjnskl84c9j20o20kejt4.jpg", 318 | "profile_url": "https:\/\/m.weibo.cn\/u\/2299119915?uid=2299119915&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 319 | "statuses_count": 431, 320 | "verified": false, 321 | "verified_type": -1, 322 | "close_blue_v": false, 323 | "description": "\u4eba\u95f4\u6b63\u9053\u662f\u6ca7\u6851\u3002", 324 | "gender": "m", 325 | "mbtype": 2, 326 | "urank": 9, 327 | "mbrank": 1, 328 | "follow_me": false, 329 | "following": false, 330 | "followers_count": 170, 331 | "follow_count": 352, 332 | "cover_image_phone": "https:\/\/tva3.sinaimg.cn\/crop.0.0.640.640.640\/68f96449tw1egwcah85a8j20hs0hsdic.jpg", 333 | "avatar_hd": "https:\/\/wx3.sinaimg.cn\/orj480\/8909c92bly8fjnskl84c9j20o20kejt4.jpg" 334 | }, 335 | "reposts_count": 0, 336 | "comments_count": 0, 337 | "attitudes_count": 0, 338 | "pending_approval_count": 0, 339 | "isLongText": false, 340 | "visible": { 341 | "type": 0, 342 | "list_id": 0 343 | }, 344 | "rid": "4_0_0_2676184252325337115", 345 | "more_info_type": 0, 346 | "status": 0, 347 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 348 | "bid": "Fupr965Pa" 349 | }, 350 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr965Pa?mblogid=Fupr965Pa&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 351 | }, 352 | { 353 | "card_type": 9, 354 | "card_type_name": "\u5fae\u535a", 355 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 356 | "actionlog": { 357 | "act_code": 554, 358 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 359 | "luicode": "", 360 | "uicode": "", 361 | "fid": "100103type=2&q=\u7279\u6717\u666e" 362 | }, 363 | "display_arrow": 0, 364 | "show_type": 1, 365 | "mblog": { 366 | "created_at": "6\u5206\u949f\u524d", 367 | "id": "4172476159858375", 368 | "mid": "4172476159858375", 369 | "idstr": "4172476159858375", 370 | "text": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7 <\/span><\/i>\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7<\/a> \u200b", 371 | "textLength": 62, 372 | "source": "ZAKER\u624e\u5ba2Android\u7248", 373 | "favorited": false, 374 | "is_paid": false, 375 | "mblog_vip_type": 0, 376 | "user": { 377 | "id": 1709297804, 378 | "screen_name": "J-\u6a3e\u7498", 379 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.180.180.180\/65e1d08cjw1e8qgp5bmzyj2050050aa8.jpg", 380 | "profile_url": 
"https:\/\/m.weibo.cn\/u\/1709297804?uid=1709297804&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 381 | "statuses_count": 5235, 382 | "verified": false, 383 | "verified_type": 220, 384 | "close_blue_v": false, 385 | "description": "\u5b9e\u4e4b\u534e\u4e4b\u5179\u4e43\u517c\u6c42\uff0c\u987a\u98ce\u516e\u9006\u98ce\u516e\u65e0\u963b\u6211\u98de\u626c\u3002", 386 | "gender": "f", 387 | "mbtype": 0, 388 | "urank": 35, 389 | "mbrank": 0, 390 | "follow_me": false, 391 | "following": false, 392 | "followers_count": 1562, 393 | "follow_count": 374, 394 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 395 | "avatar_hd": "https:\/\/ww3.sinaimg.cn\/orj480\/65e1d08cjw1e8qgp5bmzyj2050050aa8.jpg" 396 | }, 397 | "reposts_count": 0, 398 | "comments_count": 0, 399 | "attitudes_count": 0, 400 | "pending_approval_count": 0, 401 | "isLongText": false, 402 | "visible": { 403 | "type": 0, 404 | "list_id": 0 405 | }, 406 | "rid": "5_0_0_2676184252325337115", 407 | "more_info_type": 0, 408 | "status": 0, 409 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 410 | "page_info": { 411 | "page_pic": { 412 | "url": "http:\/\/zkres.myzaker.com\/data\/ads_web\/share_pic.png" 413 | }, 414 | "page_url": "http:\/\/weibo.cn\/sinaurl\/blocked295ca5d8?url=http%3A%2F%2Fiphone.myzaker.com%2Fl.php%3Fl%3D5a03e91d9490cbd97b000031&sinainternalbrowser=topnav&share_menu=1&url_type=39&object_type=webpage&pos=2&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320&u=http%3A%2F%2Fiphone.myzaker.com%2Fl.php%3Fl%3D5a03e91d9490cbd97b000031", 415 | "page_title": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7", 416 | "content1": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7", 417 | "content2": "\u4eca\u5929(11\u67089\u65e5)\u4e0a\u5348\uff0c\u56fd\u5bb6\u4e3b\u5e2d\u4e60\u8fd1\u5e73\u4e0e\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u5728\u4eba\u6c11\u5927\u4f1a\u5802\u4e3e\u884c\u4f1a\u8c08\u3002\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\uff1a\u4e3b\u5e2d\u5148\u751f\uff0c\u975e\u5e38\u611f\u8c22\uff0c\u548c\u60a8\u5728\u4e00\u8d77\u5f88\u8363\u5e78\u3002\u4e2d\u7f8e\u5173\u7cfb\u662f\u975e\u5e38\u91cd\u8981\u7684\u8bdd\u9898\uff0c\u6d89\u53ca\u6211\u4eec\u53cc\u65b9\u4e5f\u5305\u62ec\u5176\u4ed6\u4e00\u4e9b\u56fd\u5bb6\u3002\u6211\u4eec\u76f8\u4fe1\uff0c\u4e2d\u7f8e\u6709\u80fd\u529b\u5728\u4eca\u540e\u89e3\u51b3\u4e16\u754c\u95ee\u9898\u3002\u6628\u5929\u665a\u4e0a\u7684\u4f1a\u6664\u662f\u975e\u5e38\u68d2\u7684\uff0c\u6211\u4eec\u7684\u665a\u9910\u65f6\u95f4\u8d85\u51fa\u4e86\u9884\u671f\u3002\u672c\u6765\u5b89\u6392\u4e8625\u5206\u949f\u7684\u665a\u9910\uff0c\u53ef\u4f60\u8fd9\u4e48\u53cb\u597d\uff0c\u665a\u5bb4\u6301\u7eed\u4e86\u81f3\u5c11\u4e24\u4e2a\u5c0f\u65f6\u3002\u548c\u60a8\u548c\u60a8\u7684\u592b\u4eba\u4e00\u8d77\uff0c\u6bcf\u4e00\u5206\u949f\u6211\u4eec\u90fd\u975e\u5e38\u4eab\u53d7\u3002\u6211\u4eec\u7684\u5173\u7cfb\u662f...", 418 | "type": "webpage" 419 | }, 420 | "bid": "Fupr1FmC3" 421 | }, 422 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr1FmC3?mblogid=Fupr1FmC3&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 423 | }, 424 | { 425 | "card_type": 9, 426 | "card_type_name": "\u5fae\u535a", 427 | "itemid": 
"seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 428 | "actionlog": { 429 | "act_code": 554, 430 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 431 | "luicode": "", 432 | "uicode": "", 433 | "fid": "100103type=2&q=\u7279\u6717\u666e" 434 | }, 435 | "display_arrow": 0, 436 | "show_type": 1, 437 | "mblog": { 438 | "created_at": "6\u5206\u949f\u524d", 439 | "id": "4172476063590165", 440 | "mid": "4172476063590165", 441 | "idstr": "4172476063590165", 442 | "text": "\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e - \u6211\u6b63\u5728\u770b\u4e13\u9898\uff1a\u300a\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e \u5df2\u62b5\u8fbe\u9996\u90fd\u673a\u573a\u300b \u7f8e\u56fd\u8fd8\u6b20\u4e2d\u56fd\u7684\u503a\u52a16\u5343\u591a\u4ebf\u5143\uff0c\u4e3a\u4f55\u8fd8\u8981\u548c\u5b83\u7b7e\u8ba22\u5343\u591a\u4ebf\u9879\u76ee\u5408\u540c\uff0c\u4e0d\u6015\u7f8e\u56fd\u4eba\u8fd8\u4e0d\u8d77\u5417 <\/span><\/i>\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e<\/a> \u200b", 443 | "textLength": 166, 444 | "source": "bShare\u5206\u4eab", 445 | "favorited": false, 446 | "is_paid": false, 447 | "mblog_vip_type": 0, 448 | "user": { 449 | "id": 5602219262, 450 | "screen_name": "\u5c0f\u8bf4\u5bb675152", 451 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.1.45.636.636.180\/00678l9cgw1f0phvt4fsjj30hs0np0w1.jpg", 452 | "profile_url": "https:\/\/m.weibo.cn\/u\/5602219262?uid=5602219262&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 453 | "statuses_count": 11882, 454 | "verified": false, 455 | "verified_type": -1, 456 | "close_blue_v": false, 457 | "description": "\u4e00\u4e2a\u4e0d\u7518\u5bc2\u5bde\u662f\u4f60\u4e0d\u611f\u5174\u8da3\u800c\u52aa\u529b\u8ffd\u5bfb\u81ea\u5df1\u68a6\u5883\u7684\u4eba", 458 | "gender": "m", 459 | "mbtype": 0, 460 | "urank": 32, 461 | "mbrank": 0, 462 | "follow_me": false, 463 | "following": false, 464 | "followers_count": 1156, 465 | "follow_count": 1998, 466 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 467 | "avatar_hd": "https:\/\/ww2.sinaimg.cn\/orj480\/00678l9cgw1f0phvt4fsjj30hs0np0w1.jpg" 468 | }, 469 | "reposts_count": 0, 470 | "comments_count": 0, 471 | "attitudes_count": 0, 472 | "pending_approval_count": 0, 473 | "isLongText": false, 474 | "visible": { 475 | "type": 0, 476 | "list_id": 0 477 | }, 478 | "rid": "6_0_0_2676184252325337115", 479 | "more_info_type": 0, 480 | "status": 0, 481 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 482 | "page_info": { 483 | "page_pic": { 484 | "url": "https:\/\/tva4.sinaimg.cn\/crop.0.0.2780.1566\/90eb2137ly1fl41rhctqsj225c17iwzv.jpg" 485 | }, 486 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=23137500007546610938550273&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 487 | "page_title": "\u3010\u65b0\u9c9c\u4e8b\u3011\u7279\u6717\u666e\u4e9a\u6d32\u884c", 488 | "content1": "", 489 | "content2": "", 490 | "type": "webpage" 491 | }, 492 | "bid": "FupqSf3XT" 493 | }, 494 | "scheme": "https:\/\/m.weibo.cn\/status\/FupqSf3XT?mblogid=FupqSf3XT&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 495 | }, 496 | { 497 | "card_type": 9, 498 | "card_type_name": "\u5fae\u535a", 499 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 500 | "actionlog": { 501 | 
"act_code": 554, 502 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 503 | "luicode": "", 504 | "uicode": "", 505 | "fid": "100103type=2&q=\u7279\u6717\u666e" 506 | }, 507 | "display_arrow": 0, 508 | "show_type": 1, 509 | "mblog": { 510 | "created_at": "6\u5206\u949f\u524d", 511 | "id": "4172476042870320", 512 | "mid": "4172476042870320", 513 | "idstr": "4172476042870320", 514 | "text": "\u3010\u65e9\u5b89\u301111\u670810\u65e5\u65f6\u653f\u70ed\u70b9\u77e5\u8bc6\u79ef\u7d2f\uff1a<\/span><\/i>\u7f51\u9875\u94fe\u63a5<\/a>\u3000\"[\u5403\u74dc]\"<\/span>\u8981\u95fb\uff1a9\u65e5\uff0c\u4e60\u8fd1\u5e73\u5728\u4eba\u6c11\u5927\u4f1a\u5802\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u4e3e\u884c\u4f1a\u8c08\u3002\u4f1a\u8c08\u4e2d\uff0c\u4e24\u56fd\u5143\u9996\u5c31\u52a0\u5f3a\u4e2d\u7f8e\u53cc\u8fb9\u3001\u5730\u533a\u548c\u5168\u7403\u5c42\u9762\u5408\u4f5c\u8fbe\u6210\u591a\u9879\u91cd\u8981\u6210\u679c\u548c\u5171\u8bc6\u3002 \u200b", 515 | "textLength": 190, 516 | "source": "\u5fae\u535a weibo.com", 517 | "favorited": false, 518 | "thumbnail_pic": "http:\/\/wx2.sinaimg.cn\/thumbnail\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 519 | "bmiddle_pic": "http:\/\/wx2.sinaimg.cn\/bmiddle\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 520 | "original_pic": "http:\/\/wx2.sinaimg.cn\/large\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 521 | "is_paid": false, 522 | "mblog_vip_type": 0, 523 | "user": { 524 | "id": 3859325228, 525 | "screen_name": "\u54c8\u5c14\u6ee8\u534e\u56fe", 526 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.0.0.180.180.180\/e608a12cjw8eswds55qnoj20500500sr.jpg", 527 | "profile_url": "https:\/\/m.weibo.cn\/u\/3859325228?uid=3859325228&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 528 | "statuses_count": 2360, 529 | "verified": true, 530 | "verified_type": 2, 531 | "verified_type_ext": 0, 532 | "verified_reason": "\u5317\u4eac\u534e\u56fe\u5b8f\u9633\u6559\u80b2\u6587\u5316\u53d1\u5c55\u80a1\u4efd\u6709\u9650\u516c\u53f8\u54c8\u5c14\u6ee8\u5206\u516c\u53f8", 533 | "close_blue_v": false, 534 | "description": "\u9ed1\u9f99\u6c5f\u534e\u56fe\u6559\u80b2", 535 | "gender": "m", 536 | "mbtype": 0, 537 | "urank": 14, 538 | "mbrank": 0, 539 | "follow_me": false, 540 | "following": false, 541 | "followers_count": 12058, 542 | "follow_count": 181, 543 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 544 | "avatar_hd": "https:\/\/ww2.sinaimg.cn\/orj480\/e608a12cjw8eswds55qnoj20500500sr.jpg" 545 | }, 546 | "reposts_count": 0, 547 | "comments_count": 0, 548 | "attitudes_count": 0, 549 | "pending_approval_count": 0, 550 | "isLongText": false, 551 | "visible": { 552 | "type": 0, 553 | "list_id": 0 554 | }, 555 | "rid": "7_0_0_2676184252325337115", 556 | "mlevelSource": "monitor", 557 | "more_info_type": 0, 558 | "status": 0, 559 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 560 | "bid": "FupqQc2Hu", 561 | "pics": [ 562 | { 563 | "pid": "e608a12cly1flcrd5fh2qj20j60y3jtk", 564 | "url": "https:\/\/wx2.sinaimg.cn\/orj360\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 565 | "size": "orj360", 566 | "geo": { 567 | "width": 360, 568 | "height": 640, 569 | "croped": false 570 | }, 571 | "large": { 572 | "size": "large", 573 | "url": "https:\/\/wx2.sinaimg.cn\/large\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 574 | "geo": { 575 | "width": "690", 576 | "height": "1227", 577 | "croped": false 578 | } 579 | } 
580 | } 581 | ] 582 | }, 583 | "scheme": "https:\/\/m.weibo.cn\/status\/FupqQc2Hu?mblogid=FupqQc2Hu&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 584 | }, 585 | { 586 | "card_type": 9, 587 | "card_type_name": "\u5fae\u535a", 588 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 589 | "actionlog": { 590 | "act_code": 554, 591 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 592 | "luicode": "", 593 | "uicode": "", 594 | "fid": "100103type=2&q=\u7279\u6717\u666e" 595 | }, 596 | "display_arrow": 0, 597 | "show_type": 1, 598 | "mblog": { 599 | "created_at": "7\u5206\u949f\u524d", 600 | "id": "4172475753600035", 601 | "mid": "4172475753600035", 602 | "idstr": "4172475753600035", 603 | "text": "\u7279\u6717\u666e10\u65e5\u7ed3\u675f\u5bf9\u4e2d\u56fd\u7684\u56fd\u4e8b\u8bbf\u95ee \u4e58\u4e13\u673a\u79bb\u5f00\u5317\u4eac_\u7f51\u6613\u65b0\u95fb \uff08\u5206\u4eab\u81ea @\u7f51\u6613\u65b0\u95fb<\/a>\uff09 #sns_weibo#<\/a>sns_weibo&ep=Fupqnf6x5%2C2978480200%2CFupqnf6x5%2C2978480200\" class=\"\"><\/span><\/i>\u7279\u6717\u666e10\u65e5\u7ed3\u675f\u5bf9\u4e2d\u56fd\u7684\u56fd\u4e8b\u8bbf\u95ee \u4e58\u4e13\u673a\u79bb\u5f00\u5317\u4eac_\u7f51\u6613\u65b0\u95fb<\/a> \u200b", 604 | "textLength": 95, 605 | "source": "\u7f51\u6613\u65b0\u95fb", 606 | "favorited": false, 607 | "is_paid": false, 608 | "mblog_vip_type": 0, 609 | "user": { 610 | "id": 2978480200, 611 | "screen_name": "\u66fe\u601d\u6e90555", 612 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.14.0.721.721.180\/b1880048ly8fgjnua4z1gj20ku0k1gn4.jpg", 613 | "profile_url": "https:\/\/m.weibo.cn\/u\/2978480200?uid=2978480200&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 614 | "statuses_count": 15064, 615 | "verified": false, 616 | "verified_type": 220, 617 | "close_blue_v": false, 618 | "description": "\u996e\u6c34\u601d\u6e90\uff0c\u5c0a\u656c\u81f3\u4e0a\uff01", 619 | "gender": "m", 620 | "mbtype": 0, 621 | "urank": 39, 622 | "mbrank": 0, 623 | "follow_me": false, 624 | "following": false, 625 | "followers_count": 2278, 626 | "follow_count": 1205, 627 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 628 | "avatar_hd": "https:\/\/wx1.sinaimg.cn\/orj480\/b1880048ly8fgjnua4z1gj20ku0k1gn4.jpg" 629 | }, 630 | "reposts_count": 0, 631 | "comments_count": 0, 632 | "attitudes_count": 0, 633 | "pending_approval_count": 0, 634 | "isLongText": false, 635 | "visible": { 636 | "type": 0, 637 | "list_id": 0 638 | }, 639 | "rid": "8_0_0_2676184252325337115", 640 | "more_info_type": 0, 641 | "status": 0, 642 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 643 | "bid": "Fupqnf6x5" 644 | }, 645 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupqnf6x5?mblogid=Fupqnf6x5&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 646 | }, 647 | { 648 | "card_type": 9, 649 | "card_type_name": "\u5fae\u535a", 650 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 651 | "actionlog": { 652 | "act_code": 554, 653 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 654 | "luicode": "", 655 | "uicode": "", 656 | "fid": "100103type=2&q=\u7279\u6717\u666e" 657 | }, 658 | "display_arrow": 0, 659 | "show_type": 1, 660 | "mblog": { 661 | 
"created_at": "7\u5206\u949f\u524d", 662 | "id": "4172475753152500", 663 | "mid": "4172475753152500", 664 | "idstr": "4172475753152500", 665 | "text": "#\u8fd1\u65e5\u65f6\u4e8b#<\/a>2017\u5e7411\u67088\u65e5\uff0c\u7f8e\u56fd\u56fd\u5bb6\u603b\u7edf\u7279\u6717\u666e\u53ca\u5176\u592b\u4eba\u4e00\u884c\u4eba\u62b5\u8fbe\u5317\u4eac\uff0c\u8fdb\u884c\u56fd\u4e8b\u8bbf\u95ee\u3002\u8fd9\u6b21\u7279\u6717\u666e\u8bbf\u534e\uff0c\u6709\u7740\u4e09\u4e2a\u7b2c\u4e00\u6b21\uff1a\u8fd9\u662f\u7279\u6717\u666e\u4f5c\u4e3a\u7f8e\u5229\u575a\u5408\u4f17\u56fd\u603b\u7edf\uff0c\u7b2c\u4e00\u6b21\u6765\u4e2d\u56fd\u8bbf\u95ee\u3002\u8fd9\u5e94\u8be5\u4e5f\u662f\u4ed6\u6765\u5230\u4eba\u4e1671\u5e74\u6765\uff0c\u7b2c\u4e00\u6b21\u6765\u4e2d\u56fd\u3002\u8fd9\u4e5f\u662f\u4e2d\u56fd\u5386\u53f2\u6027\u5927\u4f1a\u540e\uff0c\u7b2c\u4e00\u4e2a\u6765\u8bbf\u7684\u5916\u56fd\u56fd\u5bb6\u5143\u9996\u3002\u4e2d\u65b9\u6b64\u524d\u5c31\u8868\u793a\uff0c\u4e2d\u56fd\u5c06\u4ee5 \u200b...\u5168\u6587<\/a>", 666 | "textLength": 357, 667 | "source": "\u5fae\u535a weibo.com", 668 | "favorited": false, 669 | "thumbnail_pic": "http:\/\/wx3.sinaimg.cn\/thumbnail\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 670 | "bmiddle_pic": "http:\/\/wx3.sinaimg.cn\/bmiddle\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 671 | "original_pic": "http:\/\/wx3.sinaimg.cn\/large\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 672 | "is_paid": false, 673 | "mblog_vip_type": 0, 674 | "user": { 675 | "id": 1877697625, 676 | "screen_name": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u5e7f\u64ad\u53f0", 677 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.180.180.180\/6feb6459jw1e8qgp5bmzyj2050050aa8.jpg", 678 | "profile_url": "https:\/\/m.weibo.cn\/u\/1877697625?uid=1877697625&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 679 | "statuses_count": 4833, 680 | "verified": true, 681 | "verified_type": 4, 682 | "verified_type_ext": 0, 683 | "verified_reason": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u6821\u56ed\u5e7f\u64ad\u53f0\u5b98\u65b9\u5fae\u535a", 684 | "close_blue_v": false, 685 | "description": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u5e7f\u64ad\u53f0\u96b6\u5c5e\u4e8e\u515a\u59d4\u5ba3\u4f20\u90e8\u5927\u5b66\u751f\u8bb0\u8005\u56e2\u3002\u5982\u679c\u4f60\u8ba4\u4e3a\u5e7f\u64ad\u53f0\u53ea\u6709\u64ad\u97f3\u3001\u7f16\u8f91\uff0c\u90a3\u4f60\u5c31OUT\u5566\u3002\u7269\u9662\u4e4b\u58f0\u5e7f\u64ad\u53f0\u76ee\u524d\u6709\u516d\u90e8\uff08\u529e\u516c\u5ba4\u3001\u64ad\u97f3\u90e8\u3001\u7f16\u8f91\u90e8\u3001\u5ba3\u4f20\u90e8\u3001\u520a\u7269\u90e8\u3001\u5916\u8054\u90e8\uff09\u4e00\u7ec4\uff08\u82f1\u6587\u7ec4\uff09\u3002", 686 | "gender": "f", 687 | "mbtype": 0, 688 | "urank": 32, 689 | "mbrank": 0, 690 | "follow_me": false, 691 | "following": false, 692 | "followers_count": 2514, 693 | "follow_count": 559, 694 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 695 | "avatar_hd": "https:\/\/ww3.sinaimg.cn\/orj480\/6feb6459jw1e8qgp5bmzyj2050050aa8.jpg" 696 | }, 697 | "reposts_count": 0, 698 | "comments_count": 0, 699 | "attitudes_count": 0, 700 | "pending_approval_count": 0, 701 | "isLongText": true, 702 | "visible": { 703 | "type": 0, 704 | "list_id": 0 705 | }, 706 | "rid": "9_0_0_2676184252325337115", 707 | "more_info_type": 0, 708 | "status": 0, 709 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 710 | "bid": "Fupqnde6M", 711 | "pics": [ 712 | { 713 | "pid": 
"6feb6459ly1flc60v7f3fj20hi0bsjvl", 714 | "url": "https:\/\/wx3.sinaimg.cn\/orj360\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 715 | "size": "orj360", 716 | "geo": { 717 | "width": 401, 718 | "height": 270, 719 | "croped": false 720 | }, 721 | "large": { 722 | "size": "large", 723 | "url": "https:\/\/wx3.sinaimg.cn\/large\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 724 | "geo": { 725 | "width": "630", 726 | "height": "424", 727 | "croped": false 728 | } 729 | } 730 | }, 731 | { 732 | "pid": "6feb6459ly1flc61s7ptqj20b70admyr", 733 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/6feb6459ly1flc61s7ptqj20b70admyr.jpg", 734 | "size": "orj360", 735 | "geo": { 736 | "width": 291, 737 | "height": 270, 738 | "croped": false 739 | }, 740 | "large": { 741 | "size": "large", 742 | "url": "https:\/\/wx1.sinaimg.cn\/large\/6feb6459ly1flc61s7ptqj20b70admyr.jpg", 743 | "geo": { 744 | "width": "403", 745 | "height": "373", 746 | "croped": false 747 | } 748 | } 749 | } 750 | ] 751 | }, 752 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupqnde6M?mblogid=Fupqnde6M&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 753 | } 754 | ] 755 | } 756 | ], 757 | "ok": 1, 758 | "showAppTips": 0, 759 | "scheme": "sinaweibo:\/\/cardlist?containerid=100103type=2&q=\u7279\u6717\u666e&extparam=&luicode=10000011&lfid=106003type=1&featurecode=20000320" 760 | } 761 | --------------------------------------------------------------------------------