├── OopSpider
│   └── oop
│       ├── oop
│       │   ├── __init__.py
│       │   ├── spiders
│       │   │   ├── __init__.py
│       │   │   └── oop_spider.py
│       │   ├── pipelines.py
│       │   ├── items.py
│       │   ├── middlewares.py
│       │   └── settings.py
│       └── scrapy.cfg
├── ZhenAiSpider
│   └── ZhenAi
│       ├── ZhenAi
│       │   ├── __init__.py
│       │   ├── spiders
│       │   │   ├── __init__.py
│       │   │   ├── utils.py
│       │   │   ├── mymongo.py
│       │   │   └── zhenai_spider.py
│       │   ├── items.py
│       │   ├── pipelines.py
│       │   ├── middlewares.py
│       │   └── settings.py
│       └── scrapy.cfg
├── TMallCommentSpider
│   ├── TMallCommentSpider
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   └── tmall_comment_spider.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   └── settings.py
│   ├── README.md
│   ├── scrapy.cfg
│   ├── tmall_comment_selenium_spider.py
│   └── tmall_comment_spider.py
├── BaiduZhidaoCommentSpider
│   ├── README.md
│   └── baidu_zhidao_comment_spider.py
├── README.md
├── MouserSpider
│   └── myselenium.py
├── WeiboCommentSpider
│   ├── README.md
│   ├── weibocomment.py
│   ├── weibo_comment_result_example.json
│   └── weibo_search_result_example.json
├── JDCommentSpider
│   ├── jdcomment.py
│   └── README.md
├── LICENSE
└── ZhihuSpider
    └── zhihu_spider.py
--------------------------------------------------------------------------------
/OopSpider/oop/oop/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # TMallSpider
2 | This project is not usable for now; Tmall's anti-scraping measures have not been defeated yet
--------------------------------------------------------------------------------
/OopSpider/oop/oop/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/BaiduZhidaoCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # BaiduZhidaoCommentSpider
2 | Searches for a keyword, walks through the search results, and scrapes the comment pages
3 | 
4 | ## Limitations
5 | 1. Could be reimplemented as a scrapy `CrawlSpider`, but this was not done (a sketch is appended at the end of `baidu_zhidao_comment_spider.py` below)
6 | 2. 
Baidu's anti-scraping trick: some characters in a comment are replaced with images; this could be handled with OCR, but it has not been done yet
7 | 
8 | ## api
9 | Plain `html` pages only; there is no `json` or other `REST`-style interface
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/spiders/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import urllib
5 | 
6 | 
7 | search_keyword = '美年健康'
8 | 
9 | 
10 | if __name__ == '__main__':
11 |     print urllib.quote(search_keyword.decode('utf-8').encode('gbk'))
--------------------------------------------------------------------------------
/OopSpider/oop/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = oop.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = oop
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = ZhenAi.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ZhenAi
--------------------------------------------------------------------------------
/TMallCommentSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = TMallCommentSpider.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = TMallCommentSpider
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | 
9 | class TmallcommentspiderPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class TmallcommentspiderItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
--------------------------------------------------------------------------------
/TMallCommentSpider/tmall_comment_selenium_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | from selenium import webdriver
5 | 
6 | browser = webdriver.Firefox()
7 | browser.get('https://list.tmall.com/search_product.htm?q=%C3%C0%C4%EA%BD%A1%BF%B5&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton')
8 | html_source = browser.page_source
9 | print html_source
10 | browser.close()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class ZhenaiItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pic_url = scrapy.Field()  # used for pipeline to download
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spider
2 | Some spiders for various websites
3 | 
4 | ## JDCommentSpider
5 | A focused crawler for JD.com product comments
6 | 
7 | Tags: `Python`, `requests`
8 | 
9 | ## MouserSpider
10 | A crawler for the Mouser website
11 | 
12 | Tags: `Python`, `requests`, `selenium`
13 | 
14 | ## OopSpider
15 | A crawler for the 面向对象 dating site (date.jobbole.com)
16 | 
17 | Tags: `Python`, `scrapy`
18 | 
19 | ## ZhenAiSpider
20 | A crawler for the Zhenai (珍爱网) dating site
21 | 
22 | Tags: `Python`, `scrapy`
23 | 
24 | ## WeiboCommentSpider
25 | A crawler for the comments on Sina Weibo search results
26 | 
27 | Tags: `Python`, `requests`
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.pipelines.images import ImagesPipeline
8 | from scrapy.http import Request
9 | 
10 | 
11 | class ZhenAiImagePipline(ImagesPipeline):
12 |     def get_media_requests(self, item, info):
13 |         for url in item['pic_url']:
14 |             yield Request(url)
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import re
4 | from itertools import product
5 | 
6 | 
7 | def get_brief_td_to_key_value(td):
8 |     # split the text of one brief-table <td> cell into a (key, value) pair;
9 |     # the fullwidth colon (u'\uff1a') is stripped from the key
10 |     p = '(.*?)[::]?\s?(.*?)'
11 |     result = re.findall(p, td)
12 |     if len(result) == 1:
13 |         return result[0][0].replace(u'\uff1a', ''), result[0][1]
14 |     else:
15 |         return None, None
16 | 
17 | 
18 | def get_info_td_to_key_value(td):
19 |     # same idea for the info-floor tables; here the key and the value are
20 |     # two separate matches of the pattern
21 |     p = '(.*?)[::]?\s?'
22 |     result = re.findall(p, td)
23 |     if len(result) == 2:
24 |         return result[0].replace(u'\uff1a', ''), result[1]
25 |     else:
26 |         return None, None
27 | 
28 | 
29 | def url_generator(url, *args):
30 |     # yield the url template formatted with every combination (Cartesian
31 |     # product) of the given argument lists
32 |     for items in product(*args):
33 |         yield url.format(*items)
--------------------------------------------------------------------------------
/OopSpider/oop/oop/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.contrib.pipeline.images import ImagesPipeline
8 | from scrapy.http import Request
9 | 
10 | 
11 | class OopImagesPipeline(ImagesPipeline):
12 | 
13 |     def get_media_requests(self, item, info):
14 |         for image_url in item['pic_url']:
15 |             yield Request(image_url)
16 | 
17 |     def item_completed(self, results, item, info):
18 |         # results - [(success, image_info_or_failure)] image_info - {url: x, path: x, checksum: x}
19 |         pic_paths = []
20 |         for success, image_info_or_failure in results:
21 |             if success:
22 |                 pic_paths.append(image_info_or_failure['path'])
23 |             else:
24 |                 pic_paths.append([])
25 |         item['pic_path'] = pic_paths
26 |         return item
--------------------------------------------------------------------------------
/OopSpider/oop/oop/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class OopItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     url = scrapy.Field()
15 |     title = scrapy.Field()
16 |     date = scrapy.Field()
17 |     location = scrapy.Field()
18 |     birth = scrapy.Field()
19 |     tall = scrapy.Field()
20 |     work_city = scrapy.Field()
21 |     born_city = scrapy.Field()
22 |     work = scrapy.Field()
23 |     parent = scrapy.Field()
24 |     only_child = scrapy.Field()
25 |     rich = scrapy.Field()
26 |     interest = scrapy.Field()
27 |     distance_love = scrapy.Field()
28 |     year_married = scrapy.Field()
29 |     num_child = scrapy.Field()
30 |     lowest_command = scrapy.Field()
31 |     special_command = scrapy.Field()
32 |     introduction = scrapy.Field()
33 |     pic_url = scrapy.Field()
34 |     pic_path = scrapy.Field()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/mymongo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import pymongo
4 | import traceback
5 | 
6 | 
7 | class MyMongo(object):
8 |     def __init__(self, host='127.0.0.1', port=27017):
9 |         self.client = None
10 |         self.db = None
11 |         try:
12 |             self.client = pymongo.MongoClient(host=host, port=port)
13 |         except:
14 |             traceback.print_exc()
15 |             self.client = None
16 | 
17 |     def get_db(self, db_name):
18 |         if self.client:
19 |             try:
20 |                 self.db = self.client[db_name]
21 |             except:
22 |                 traceback.print_exc()
23 | 
24 |     def insert_doc(self, db_name, doc_name, doc_list):
25 |         # doc_list - [doc1, doc2, ..., docN]
26 |         self.get_db(db_name)  # get database to self.db
27 |         if self.db is None:
28 |             return False
29 |         try:
30 |             doc = self.db[doc_name]
31 |             doc.insert(doc_list)
32 |         except:
33 |             traceback.print_exc()
34 |             return False
35 |         return True
36 | 
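37 | 
38 | if __name__ == '__main__':
39 |     # Minimal usage sketch (assumes a MongoDB server on the default
40 |     # localhost:27017; the 'TestData' collection name is made up for
41 |     # illustration -- the spider itself writes to 'SimpleData' and
42 |     # 'CompleteData').
43 |     mongo = MyMongo()
44 |     ok = mongo.insert_doc('ZhenAi', 'TestData', [{'memberId': 0}])
45 |     print 'insert ok:', ok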
--------------------------------------------------------------------------------
/MouserSpider/myselenium.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from selenium import webdriver
4 | import requests
5 | import sqlite3
6 | 
7 | browser = webdriver.Firefox()
8 | browser.get('http://www.mouser.cn')
9 | html_source = browser.page_source
10 | print html_source
11 | 
12 | # read the session cookies straight out of the Firefox profile's sqlite store
13 | conn = sqlite3.connect('/root/.mozilla/firefox/gmfs2ivm.default/cookies.sqlite')
14 | cursor = conn.cursor()
15 | cursor.execute('select name, value from moz_cookies where baseDomain="mouser.cn"')
16 | cookies = cursor.fetchall()
17 | conn.close()
18 | 
19 | 
20 | cookie = [item[0] + "=" + item[1] for item in cookies]
21 | 
22 | cookiestr = ';'.join(cookie)
23 | 
24 | print cookiestr
25 | 
26 | myheaders = {
27 |     'Host': 'www.mouser.cn',
28 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
29 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
30 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
31 |     'Accept-Encoding': 'gzip, deflate',
32 |     'Upgrade-Insecure-Requests': '1',
33 |     'If-None-Match': "76b9f323a7b0ec42447e8435c1bc98bd",
34 |     'Cache-Control': 'max-age=0',
35 |     'Cookie': cookiestr
36 | }
37 | 
38 | s = requests.session()
39 | r = s.get('http://www.mouser.cn/Semiconductors/RF-Semiconductors/_/N-96p9c/', headers=myheaders)
40 | 
41 | data = r.content
42 | 
43 | f = open('data.html', 'w')
44 | f.write(data)
45 | f.close()
46 | 
47 | browser.close()
--------------------------------------------------------------------------------
/OopSpider/oop/oop/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class OopSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class ZhenaiSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/TMallCommentSpider/TMallCommentSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class TmallcommentspiderSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /OopSpider/oop/oop/spiders/oop_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from oop.items import OopItem 5 | from scrapy.contrib.spiders import CrawlSpider 6 | from scrapy.contrib.spiders import Rule 7 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 8 | from scrapy.utils.response import get_base_url 9 | 10 | 11 | class OopSpider(CrawlSpider): 12 | name = 'OopSpider' 13 | allowed_domains = ['date.jobbole.com'] 14 | start_urls = ['http://date.jobbole.com'] 15 | rules = [ 16 | Rule(SgmlLinkExtractor(allow=('/page/\d{1,3}/')), follow=True, callback='parse_item') 17 | ] 18 | index_flag = True 19 | 20 | def parse_item(self, response): 21 | # add front page search 22 | if OopSpider.index_flag: 23 | OopSpider.index_flag = False 24 | yield scrapy.Request(OopSpider.start_urls[0], callback=self.parse_item) 25 | 26 | html_selector = scrapy.Selector(response) 27 | urls = html_selector.xpath('//li[@class="media"]/div/h3/a/@href').extract() 28 | for url in urls: 29 | yield scrapy.Request(url, callback=self.parse_detail_item) 30 | 31 | def parse_detail_item(self, response): 32 | items = OopItem() 33 | html_selector = scrapy.Selector(response) 34 | 35 | items['url'] = get_base_url(response) 36 | 37 | head_prefix = '//div[@class="p-single"]' 38 | items['title'] = html_selector.xpath(head_prefix + '//h1/text()').extract() 39 | items['date'] = html_selector.xpath(head_prefix + '//p[@class="p-meta"]/span[1]/text()').extract() 40 | items['location'] = html_selector.xpath(head_prefix + '//p[@class="p-meta"]/span[2]/a/text()').extract() 41 | 42 | detail_prefix = '//div[@class="p-entry"]' 43 | details = html_selector.xpath(detail_prefix + '/p/text()').extract() 44 | details = map(lambda x: x.replace('\n', ''), details) 45 | items['birth'] = details[0] 46 | items['tall'] = details[1] 47 | items['work_city'] = details[2] 48 | items['born_city'] = details[3] 49 | items['work'] = details[4] 50 | items['parent'] = details[5] 51 | items['only_child'] = details[6] 52 | items['rich'] = 
details[7]
53 |         items['interest'] = details[8]
54 |         items['distance_love'] = details[9]
55 |         items['year_married'] = details[10]
56 |         items['num_child'] = details[11]
57 |         items['lowest_command'] = details[12]
58 |         items['special_command'] = details[13]
59 |         items['introduction'] = details[14]
60 |         items['pic_url'] = html_selector.xpath(detail_prefix + '/p/img/@src').extract()
61 |         return items
--------------------------------------------------------------------------------
/WeiboCommentSpider/README.md:
--------------------------------------------------------------------------------
1 | # WeiboCommentSpider
2 | A crawler for Weibo comments
3 | 
4 | ## How it works
5 | Log in through the Weibo login API to obtain `cookies`, then search and scrape comments at will on `m.weibo.cn`
6 | 
7 | ## Limitations
8 | 1. Only the latest 1000 Weibo posts in the search results can be fetched
9 | 2. Serial (non-concurrent) version
10 | 3. Output is encoded as `UTF-8 without BOM`, so the resulting csv file cannot be opened directly in `Microsoft Office Excel`; converting it to `UTF-8 with BOM` with `Notepad++` is recommended
11 | 
12 | ## Login API
13 | ### Implementation
14 | ```python
15 | def login(self):
16 |     user = self.get_b64_username()  # the username is urlencoded and then base64-encoded
17 |     passwd = self.get_rsa_password(self.pubkey, self.nonce, self.servertime)
18 |     self.login_params['su'] = user
19 |     self.login_params['servertime'] = self.servertime
20 |     self.login_params['nonce'] = self.nonce
21 |     self.login_params['rsakv'] = self.rsakv
22 |     self.login_params['sp'] = passwd
23 | 
24 |     resp = self.session.post(self.login_url, data=self.login_params, headers=self.login_headers)
25 |     if 'retcode%3D0' in resp.content:
26 |         print 'login success'
27 |         return True
28 |     print 'login fail'
29 |     print resp.content
30 |     return False
31 | ```
32 | ### How it works
33 | First fetch the parameters needed for encryption (`pubkey`, `servertime`, `nonce`, `rsakv`) from `https://login.sina.com.cn/sso/prelogin.php`,
34 | then encrypt the password with rsa2 (the js encryption code is shown below), and finally post it to the server's login endpoint
35 | ```javascript
36 | var RSAKey = new sinaSSOEncoder.RSAKey();
37 | RSAKey.setPublic(me.rsaPubkey, "10001");
38 | password = RSAKey.encrypt([me.servertime, me.nonce].join("\t") + "\n" + password)
39 | ```
40 | 
41 | ## About the Weibo and comment APIs
42 | `m.weibo.cn` is a magical entry point: it logs in with a plaintext `password` and its endpoints return `json`,
43 | but it only exposes the `1000` most recent posts sorted by time. `weibo.cn`, on the other hand, uses an `rsa2`-encrypted password, loads data through suffocating dynamic `ajax` calls,
44 | and returns `html`/`js` text. To save time, the `m.weibo.cn` entry point is used for now; it can be analysed further if the need arises
45 | 
46 | ## Weibo API
47 | ### headers
48 | ```python
49 | self.search_headers = {
50 |     'Host': 'm.weibo.cn',
51 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
52 |     'Accept': 'application/json, text/plain, */*',
53 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
54 |     'Accept-Encoding': 'gzip, deflate, br',
55 |     'X-Requested-With': 'XMLHttpRequest',
56 |     'Referer': '',
57 | }
58 | self.search_referer = 'https://m.weibo.cn/p/100103type%3D2%26q%3D{keyword}?type=wb&queryVal={keyword}' \
59 |                       '&featurecode=20000320&luicode=10000011&lfid=106003type%3D1&title={keyword}'
60 | ```
61 | ### params
62 | ```python
63 | self.search_data = {
64 |     'type': 'wb',
65 |     'queryVal': '{keyword}',
66 |     'featurecode': '20000320',
67 |     'luicode': '10000011',
68 |     'lfid': '106003type=1',
69 |     'title': '{keyword}',
70 |     'containerid': '100103type=2&q={keyword}',
71 | }
72 | ```
73 | ### response
74 | See [weibo_search_result_example.json](weibo_search_result_example.json)
75 | 
76 | ## Comment API
77 | ### headers
78 | The `Referer` is validated
79 | ```python
80 | self.comment_headers = {
81 |     'Host': 'm.weibo.cn',
82 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
83 |     'Accept': 'application/json, text/plain, */*',
84 |     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
85 |     'Accept-Encoding': 'gzip, deflate, br',
86 | 
    'X-Requested-With': 'XMLHttpRequest',
87 |     # 'Referer': 'https://m.weibo.cn/status/4172328575544621',
88 |     'Connection': 'keep-alive',
89 | }
90 | ```
91 | ### params
92 | ```python
93 | self.comment_data = {
94 |     'id': '{id}',  # Weibo post id
95 |     'page': '{page}',  # page number
96 | }
97 | ```
98 | ### response
99 | See [weibo_comment_result_example.json](weibo_comment_result_example.json)
--------------------------------------------------------------------------------
/OopSpider/oop/oop/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for oop project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'oop'
13 | 
14 | SPIDER_MODULES = ['oop.spiders']
15 | NEWSPIDER_MODULE = 'oop.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'oop (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'oop.middlewares.OopSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'oop.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |     'oop.pipelines.OopImagesPipeline': 300,
69 | }
70 | 
71 | # ImagePipeline Setting
72 | IMAGES_STORE = './oop/pic'
73 | IMAGES_EXPIRES = 90
74 | 
75 | # Enable and configure the AutoThrottle extension (disabled by default)
76 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
77 | #AUTOTHROTTLE_ENABLED = True
78 | # The initial download delay
79 | #AUTOTHROTTLE_START_DELAY = 5
80 | # The maximum download delay to be set in case of high latencies
81 | #AUTOTHROTTLE_MAX_DELAY = 60
82 | # The average number of requests Scrapy 
should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | -------------------------------------------------------------------------------- /TMallCommentSpider/TMallCommentSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for TMallCommentSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'TMallCommentSpider' 13 | 14 | SPIDER_MODULES = ['TMallCommentSpider.spiders'] 15 | NEWSPIDER_MODULE = 'TMallCommentSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'TMallCommentSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | DEFAULT_REQUEST_HEADERS = { 47 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | 'Accept-Language': 'en-US,en;q=0.5', 50 | 'Accept-Encoding': 'gzip, deflate, br', 51 | 'Connection': 'keep-alive', 52 | 'Upgrade-Insecure-Requests': '1', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'TMallCommentSpider.middlewares.TmallcommentspiderSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'TMallCommentSpider.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | 
#EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | #ITEM_PIPELINES = { 76 | # 'TMallCommentSpider.pipelines.TmallcommentspiderPipeline': 300, 77 | #} 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | #AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | #AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | #AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of requests Scrapy should be sending in parallel to 87 | # each remote server 88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | #AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | #HTTPCACHE_ENABLED = True 95 | #HTTPCACHE_EXPIRATION_SECS = 0 96 | #HTTPCACHE_DIR = 'httpcache' 97 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /ZhenAiSpider/ZhenAi/ZhenAi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ZhenAi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ZhenAi' 13 | 14 | SPIDER_MODULES = ['ZhenAi.spiders'] 15 | NEWSPIDER_MODULE = 'ZhenAi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ZhenAi (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', 44 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 46 | 'Accept-Encoding': 'gzip, deflate', 47 | 'X-Requested-With': 'XMLHttpRequest', 48 | 'Referer': 'http://search.zhenai.com/v2/search/pinterest.do?' 
49 | 'sex=1&agebegin=18&ageend=-1&workcityprovince=-1&workcitycity=-1' 50 | '&info=&h1=-1&h2=-1&salaryBegin=-1&salaryEnd=-1&occupation=-1&h=-1' 51 | '&c=-1&workcityprovince1=-1&workcitycity1=-1&constellation=-1&animals=-1' 52 | '&stock=-1&belief=-1&lvBegin=-1&lvEnd=-1&condition=66&orderby=hpf&hotIndex=&online=', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'ZhenAi.middlewares.ZhenaiSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'ZhenAi.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'ZhenAi.pipelines.ZhenAiImagePipline': 300, 77 | } 78 | IMAGES_STORE = 'pic' 79 | IMAGES_EXPIRES = 90 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /TMallCommentSpider/TMallCommentSpider/spiders/tmall_comment_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import scrapy 5 | import config 6 | import urllib 7 | import re 8 | import json 9 | import time 10 | import random 11 | import requests 12 | import codecs 13 | import datetime 14 | 15 | 16 | class TmallCommentSpider(scrapy.Spider): 17 | name = 'tmall_comment_spider' 18 | keyword = urllib.quote(config.search_keyword.decode('utf-8').encode('gbk')) 19 | search_url = 'https://list.tmall.com/search_product.htm?q={keyword}&type=p&vmarket=' \ 20 | '&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'.format(keyword=keyword) 21 | item_url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.50092370Go9Qa5' \ 22 | '&s={start_index}&q={keyword}&sort=s&style=g&from=.list.pc_1_searchbutton' \ 23 | '&type=pc#J_Filter' 24 | comment_url = 'https://rate.tmall.com/list_detail_rate.htm' 25 | 26 | my_headers = { 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
29 |         'Accept-Language': 'en-US,en;q=0.5',
30 |         'Accept-Encoding': 'gzip, deflate, br',
31 |     }
32 | 
33 |     start_urls = [search_url]
34 | 
35 |     filename = 'tmall-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
36 | 
37 |     def parse(self, response):
38 |         total_page = response.xpath('//input[@name="totalPage"]/@value').extract_first()
39 |         if total_page:
40 |             total_page = int(total_page)
41 |             for page in range(total_page):
42 |                 start_index = page * 60
43 |                 page_url = self.item_url.format(start_index=start_index, keyword=self.keyword)
44 |                 yield scrapy.Request(url=page_url, method='GET', callback=self.parse_search_result,
45 |                                      headers=self.my_headers)
46 | 
47 |     def parse_search_result(self, response):
48 |         item_urls = response.xpath('//p[@class="productStatus"]//a/@href').extract()
49 |         comment_nums = response.xpath('//p[@class="productStatus"]//a/text()').extract()
50 |         for i in range(len(item_urls)):
51 |             if int(comment_nums[i]):
52 |                 yield scrapy.Request(url='https:' + item_urls[i], method='GET', callback=self.parse_item,
53 |                                      headers=self.my_headers)
54 | 
55 |     def parse_item(self, response):
56 |         resp_text = response.text.replace('\n', '')
57 |         pattern = 'TShop\.Setup\((.*?)\);'
58 |         result = re.findall(pattern, resp_text)
59 | 
60 |         data = {
61 |             'itemId': '',  # required
62 |             'spuId': '',  # required
63 |             'sellerId': '',  # required
64 |             'order': '3',
65 |             'currentPage': '1',  # page number
66 |             'append': '0',
67 |             'content': '1',
68 |             'tagId': '',
69 |             'posi': '',
70 |             'picture': '',
71 |             'ua': '',
72 |             'needFold': '0',
73 |             '_ksTS': '',  # millisecond timestamp + '_' + 4-digit random number
74 |             'callback': '',  # 'jsonp_' + (4-digit random number + 1)
75 |         }
76 | 
77 |         if result:
78 |             json_data = json.loads(result[0])
79 |             item_do = json_data['itemDO']
80 | 
81 |             data['itemId'] = item_do['itemId']
82 |             data['spuId'] = item_do['spuId']
83 |             data['sellerId'] = item_do['userId']
84 |             random_int = random.randint(1000, 9998)
85 |             data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
86 |             data['callback'] = 'jsonp_%d' % (random_int + 1)
87 | 
88 |             my_headers = self.my_headers.copy()
89 |             my_headers['Host'] = 'rate.tmall.com'
90 |             my_headers['Referer'] = response.url
91 | 
92 |             resp = requests.get(self.comment_url, params=data, headers=my_headers)
93 |             json_str = resp.content[len(data['callback']) + 1:-1]  # strip the jsonp wrapper: callback( ... )
94 |             json_data = json.loads(json_str)
95 |             max_pages = json_data['rateDetail']['paginator']['lastPage']
96 |             for i in range(max_pages):
97 |                 random_int = random.randint(1000, 9998)
98 |                 data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
99 |                 data['callback'] = 'jsonp_%d' % (random_int + 1)
100 |                 data['currentPage'] = str(i + 1)
101 | 
102 |                 url = self.comment_url + '?' + urllib.urlencode(data)
103 |                 yield scrapy.Request(url=url, headers=my_headers, callback=self.parse_comment)
104 | 
105 |     def parse_comment(self, response):
106 |         json_data = json.loads(response.text[len('jsonp_9999') + 1:-1])  # the callback name is always 10 characters long
107 |         rate_list = json_data['rateDetail']['rateList']
108 |         for rate in rate_list:
109 |             user_nickname = rate['displayUserNick']
110 |             user_id = rate['id']
111 |             rate_content = rate['rateContent']
112 |             rate_date = rate['rateDate']
113 | 
114 |             with codecs.open(self.filename, 'a') as f:
115 |                 f.write('|'.join((str(user_id), user_nickname, rate_content, rate_date)) + '\n')
--------------------------------------------------------------------------------
/BaiduZhidaoCommentSpider/baidu_zhidao_comment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import requests
5 | import urllib
6 | from lxml import etree
7 | import re
8 | import datetime
9 | import codecs
10 | import time
11 | import random
12 | 
13 | SLEEP = [0.5, 1, 1.5, 2, 2.5, 3]
14 | 
15 | 
16 | class BaiduZhidao():
17 |     search_url = 'https://zhidao.baidu.com/search?word={keyword}&ie=gbk&site=-1&sites=0&date=0&pn=PAGE'
18 |     my_headers = {
19 |         'Host': 'zhidao.baidu.com',
20 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
21 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
22 |         'Accept-Language': 'en-US,en;q=0.5',
23 |         'Accept-Encoding': 'gzip, deflate, br',
24 |     }
25 |     comment_url = 'https://zhidao.baidu.com/question/{question_id}.html?sort=9&rn=5&pn=PAGE#wgt-answers'
26 | 
27 |     def __init__(self, keyword):
28 |         self.session = requests.Session()
29 |         self.keyword = keyword
30 |         self.search_url = self.search_url.format(keyword=urllib.quote(keyword.decode('utf-8').encode('gbk')))
31 |         self.question_ids = []
32 |         self.filename = 'baidu_zhidao-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
33 | 
34 |     def set_keyword(self, keyword):
35 |         self.keyword = keyword
36 | 
37 |     def reset_filename(self):
38 |         self.filename = 'baidu_zhidao-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
39 | 
40 |     @staticmethod
41 |     def extract_question_id(url):
42 |         pattern = '/question/(\d+?)\.'
43 |         result = re.findall(pattern, url)
44 |         if result:
45 |             return result[0]
46 |         else:
47 |             return None
48 | 
49 |     @staticmethod
50 |     def html_filter(html_text):
51 |         html_text = html_text.replace('\n', '').replace('\t', ' ')
52 |         pattern = re.compile(r'<[^>]+>', re.S)
53 |         no_html_text = pattern.sub('', html_text)
54 |         return no_html_text
55 | 
56 |     def search(self, page=0):
57 |         print '-*- start search with page %d -*-' % (page / 10 + 1)
58 |         time.sleep(SLEEP[random.randint(0, len(SLEEP) - 1)])
59 |         resp = self.session.get(url=self.search_url.replace('PAGE', str(page)), headers=self.my_headers)
60 |         if resp.status_code == 200:
61 |             response = etree.HTML(resp.text)
62 |             urls = response.xpath('//a[@class="ti"]/@href')
63 |             self.question_ids.extend(filter(None, map(self.extract_question_id, urls)))
64 | 
65 |             next_page = response.xpath('//a[@class="pager-next"]/@href')
66 |             if next_page:
67 |                 next_page_number = re.findall('&pn=(\d+)$', next_page[0])
68 |                 if next_page_number:
69 |                     next_page_number = int(next_page_number[0])
70 |                 else:
71 |                     next_page_number = 0
72 |                 self.search(page=next_page_number)  # recurse until there is no next page
73 |             else:
74 |                 print '=*= end search with page %d =*=' % (page / 10 + 1)
75 |         else:
76 |             print 'Error status code %d in getting search result with page %d' % (resp.status_code, (page / 10 + 1))
77 |             print resp.content
78 | 
79 |     def print_question_ids(self):
80 |         print self.question_ids
81 | 
82 |     def find_comments(self):
83 |         total = len(self.question_ids)
84 |         for i, question_id in enumerate(self.question_ids):
85 |             print '|*| start get content from question id %s - %d/%d |*|' % (question_id, i + 1, total)
86 |             url = self.comment_url.format(question_id=question_id)
87 |             self.comment(url)
88 |             print '_*_ end get content from question id %s - %d/%d _*_' % (question_id, i + 1, total)
89 | 
90 |     def comment(self, url, page=0):
91 |         print ' * start get comments with page %d *' % (page / 5 + 1)
92 |         time.sleep(SLEEP[random.randint(0, len(SLEEP) - 1)])
93 |         resp = self.session.get(url.replace('PAGE', str(page)), headers=self.my_headers, allow_redirects=False)
94 |         if resp.status_code != 200:
95 |             print 'Error status code %d in getting comment result with page %d' % (resp.status_code, (page / 5 + 1))
96 |             print resp.content
97 |         else:
98 |             response = etree.HTML(resp.content)
99 |             comment_nodes = response.xpath('//span[@class="con"]')
100 |             comments = []
101 |             for node in comment_nodes:
102 |                 print node.xpath('string(.)')
103 |                 comments.append(node.xpath('string(.)').strip())
104 |             print ' | get %d comments | ' % len(comments)
105 | 
106 |             # get the question title
107 |             ask_title = response.xpath('//title/text()')
108 |             if ask_title:
109 |                 ask_title = ask_title[0]
110 |             else:
111 |                 ask_title = ""
112 | 
113 |             if comments:
114 |                 comments = map(self.html_filter, comments)
115 |                 with codecs.open(self.filename, 'a', encoding='utf-8') as f:
116 |                     for data in comments:
117 |                         f.write(ask_title + '|' + data + '\n')
118 | 
119 |             next_page = response.xpath('//a[@class="pager-next"]/@href')
120 |             if next_page:
121 |                 next_page_number = re.findall('&pn=(\d+)#', next_page[0])
122 |                 if next_page_number:
123 |                     next_page_number = int(next_page_number[0])
124 |                 else:
125 |                     next_page_number = 0
126 |                 self.comment(url, next_page_number)  # recurse until there is no next page
127 |             else:
128 |                 print ' - end get comments with page %d -' % (page / 5 + 1)
129 | 
130 | 
131 | if __name__ == '__main__':
132 |     baidu_zhidao = BaiduZhidao('美年大健康')
133 |     baidu_zhidao.search()
134 |     baidu_zhidao.find_comments()
135 | 
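136 | 
137 | # The README lists "convert to a scrapy CrawlSpider" as an improvement that
138 | # was never done.  Below is a minimal sketch of what that skeleton might look
139 | # like.  It is an illustration only: the link-extraction rules and the XPaths
140 | # are assumptions carried over from the requests-based code above, not a
141 | # tested implementation.
142 | from scrapy.spiders import CrawlSpider, Rule
143 | from scrapy.linkextractors import LinkExtractor
144 | 
145 | 
146 | class BaiduZhidaoCrawlSpider(CrawlSpider):
147 |     name = 'baidu_zhidao_crawl_spider'
148 |     allowed_domains = ['zhidao.baidu.com']
149 |     # reuse the class-level search_url template, starting at page 0
150 |     start_urls = [BaiduZhidao.search_url.format(
151 |         keyword=urllib.quote('美年大健康'.decode('utf-8').encode('gbk'))).replace('PAGE', '0')]
152 |     rules = [
153 |         # follow the search-result pagination links
154 |         Rule(LinkExtractor(restrict_xpaths='//a[@class="pager-next"]'), follow=True),
155 |         # question detail pages carry the answers/comments
156 |         Rule(LinkExtractor(allow=r'/question/\d+\.html'), callback='parse_question'),
157 |     ]
158 | 
159 |     def parse_question(self, response):
160 |         # same XPath as BaiduZhidao.comment(): each answer body is a span.con
161 |         for node in response.xpath('//span[@class="con"]'):
162 |             yield {'comment': node.xpath('string(.)').extract_first().strip()}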
--------------------------------------------------------------------------------
/TMallCommentSpider/tmall_comment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author : Huangcc
3 | 
4 | import urllib
5 | import re
6 | import json
7 | import time
8 | import random
9 | import requests
10 | import codecs
11 | import datetime
12 | from lxml import etree
13 | 
14 | search_keyword = '美年健康'
15 | 
16 | 
17 | class TmallCommentSpider():
18 |     name = 'tmall_comment_spider'
19 |     keyword = urllib.quote(search_keyword.decode('utf-8').encode('gbk'))
20 |     search_url = 'https://list.tmall.com/search_product.htm?q={keyword}&type=p&vmarket=' \
21 |                  '&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'.format(keyword=keyword)
22 |     item_url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.50092370Go9Qa5' \
23 |                '&s={start_index}&q={keyword}&sort=s&style=g&from=.list.pc_1_searchbutton' \
24 |                '&type=pc#J_Filter'
25 |     comment_url = 'https://rate.tmall.com/list_detail_rate.htm'
26 | 
27 |     my_headers = {
28 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
29 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
30 |         'Accept-Language': 'en-US,en;q=0.5',
31 |         'Accept-Encoding': 'gzip, deflate, br',
32 |     }
33 | 
34 |     start_urls = [search_url]
35 | 
36 |     def __init__(self):
37 |         self.filename = 'tmall-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M'))
38 |         self.session = requests.session()
39 |         print self.keyword
40 | 
41 |     def pre_get(self):
42 |         self.session.get(url='https://www.tmall.com/')
43 | 
44 |     def start_query(self):
45 |         self.pre_get()
46 |         my_headers = self.my_headers.copy()
47 |         my_headers['Host'] = 'list.tmall.com'
48 |         my_headers['Connection'] = 'keep-alive'
49 |         my_headers['Upgrade-Insecure-Requests'] = '1'
50 |         my_headers['Referer'] = 'https://www.tmall.com/'
51 |         for url in self.start_urls:
52 |             resp = self.session.get(url=url, headers=my_headers, allow_redirects=False)
53 |             # resp = self.session.get(url=url, headers=my_headers)
54 |             print resp.url
55 |             print resp.content
56 |             self.parse(resp)
57 | 
58 |     def parse(self, response):
59 |         response = etree.HTML(response.content)
60 |         total_page_selector = response.xpath('//input[@name="totalPage"]')
61 |         if total_page_selector:
62 |             total_page = int(total_page_selector[0].get('value'))
63 |             for page in range(total_page):
64 |                 start_index = page * 60
65 |                 page_url = self.item_url.format(start_index=start_index, keyword=self.keyword)
66 |                 resp = self.session.get(url=page_url, headers=self.my_headers)
67 |                 self.parse_search_result(resp)
68 | 
69 |     def parse_search_result(self, response):
70 |         response = etree.HTML(response.content)
71 |         item_urls = response.xpath('//p[@class="productStatus"]//a/@href')
72 |         comment_nums = response.xpath('//p[@class="productStatus"]//a/text()')
73 |         for i in range(len(item_urls)):
74 |             if int(comment_nums[i]):
75 |                 resp = self.session.get(url='https:' + item_urls[i], headers=self.my_headers)
76 |                 self.parse_item(resp)
77 | 
78 |     def parse_item(self, response):
79 |         resp_text = response.text.replace('\n', '')
80 |         pattern = 'TShop\.Setup\((.*?)\);'
81 |         result = re.findall(pattern, resp_text)
82 | 
83 |         data = {
84 |             'itemId': '',  # required
85 |             'spuId': '',  # required
86 |             'sellerId': '',  # required
87 |             'order': '3',
88 |             'currentPage': '1',  # page number
89 |             'append': '0',
90 |             'content': '1',
91 |             'tagId': '',
92 |             'posi': '',
93 |             'picture': '',
94 |             'ua': '',
95 |             'needFold': '0',
96 |             '_ksTS': '',  # millisecond timestamp + '_' + 4-digit random number
97 |             'callback': '',  # 'jsonp_' + (4-digit random number + 1)
98 |         }
99 | 
100 |         if result:
101 |             json_data = json.loads(result[0])
102 |             item_do = json_data['itemDO']
103 | 
104 |             data['itemId'] = item_do['itemId']
105 |             data['spuId'] = item_do['spuId']
106 |             data['sellerId'] = item_do['userId']
107 |             random_int = random.randint(1000, 9998)
108 |             data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
109 |             data['callback'] = 'jsonp_%d' % (random_int + 1)
110 | 
111 |             my_headers = self.my_headers.copy()
112 |             my_headers['Host'] = 'rate.tmall.com'
113 |             my_headers['Referer'] = response.url
114 | 
115 |             resp = requests.get(self.comment_url, params=data, headers=my_headers)
116 |             json_str = resp.content[len(data['callback']) + 1:-1]  # strip the jsonp wrapper: callback( ... )
117 |             json_data = json.loads(json_str)
118 |             max_pages = json_data['rateDetail']['paginator']['lastPage']
119 |             for i in range(max_pages):
120 |                 random_int = random.randint(1000, 9998)
121 |                 data['_ksTS'] = '%d_%d' % (time.time() * 1000, random_int)  # millisecond timestamp + '_' + 4-digit random number
122 |                 data['callback'] = 'jsonp_%d' % (random_int + 1)
123 |                 data['currentPage'] = str(i + 1)
124 | 
125 |                 resp = self.session.get(url=self.comment_url, params=data, headers=my_headers)
126 |                 self.parse_comment(resp)
127 | 
128 |     def parse_comment(self, response):
129 |         json_data = json.loads(response.text[len('jsonp_9999') + 1:-1])  # the callback name is always 10 characters long
130 |         rate_list = json_data['rateDetail']['rateList']
131 |         for rate in rate_list:
132 |             user_nickname = rate['displayUserNick']
133 |             user_id = rate['id']
134 |             rate_content = rate['rateContent']
135 |             rate_date = rate['rateDate']
136 | 
137 |             with codecs.open(self.filename, 'a') as f:
138 |                 f.write('|'.join((str(user_id), user_nickname, rate_content, rate_date)) + '\n')
139 | 
140 | if __name__ == '__main__':
141 |     tmall = TmallCommentSpider()
142 |     tmall.start_query()
--------------------------------------------------------------------------------
/ZhenAiSpider/ZhenAi/ZhenAi/spiders/zhenai_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | import json
5 | import traceback
6 | import mymongo
7 | from utils import *
8 | from ZhenAi.items import ZhenaiItem
9 | from scrapy.utils.response import get_base_url
10 | 
11 | 
12 | class ZhenaiSpider(scrapy.Spider):
13 |     name = 'zhenai_spider'
14 |     # generate all base urls
15 |     base_url = 'http://search.zhenai.com/v2/search/getPinterestData.do?sex={}&agebegin={}&ageend={}&workcityprovince={}' \
16 |                '&workcitycity={}&education={}' \
17 |                '&occupation={}&info=&marriage={}&h1={}&h2={}&salaryBegin={}&salaryEnd={}' \
18 |                '&h={}&c={}&workcityprovince1={}&workcitycity1={}&constellation={}&animals={}&stock={}&belief={}' \
19 |                '&lvBegin={}&lvEnd={}&condition=66&orderby=hpf&hotIndex=&online={}&currentpage={}&topSearch=false'
20 |     sex = [0, 1]
21 |     agebegin = range(18, 100)
22 |     agebegin.append(-1)
23 |     agebegin.reverse()
24 |     ageend = range(18, 100)
25 |     ageend.append(-1)
26 |     ageend.reverse()
27 |     workcityprovince = [-1]  # TODO
28 |     workcitycity = [-1]  # TODO
29 |     education = range(2, 8)
30 |     education.append(-1)
31 |     education.reverse()
32 |     occupation = range(100, 2900, 100)
33 |     occupation.append(-1)
34 |     occupation.reverse()
35 |     marriage = [-1, 1, 3, 4]  # unmarried / divorced / widowed
36 |     h1 = range(129, 212)
37 |     h1.append(-1)
38 |     h1.reverse()
39 |     h2 = range(129, 212)
40 |     h2.append(-1)
41 |     h2.reverse()
42 |     salaryBegin = range(103, 109)  # monthly income
43 |     salaryBegin.append(-1)
44 |     salaryBegin.reverse()
45 |     salaryEnd = range(103, 109)
46 |     salaryEnd.append(-1)
47 |     salaryEnd.reverse()
48 |     h = range(1, 6)  # house
49 |     h.append(-1)
50 |     h.reverse()
51 |     c = range(1, 6)  # children
52 |     c.append(-1)
53 |     c.reverse()
54 |     workcityprovince1 = [-1]  # TODO
55 |     workcitycity1 = [-1]  # TODO
56 |     constellation = range(1, 13)  # zodiac sign
57 |     constellation.append(-1)
58 |     constellation.reverse()
59 |     animals = range(1, 13)  # Chinese zodiac animal
60 |     animals.append(-1)
61 |     animals.reverse()
62 |     stock = range(1, 58)  # ethnicity
63 |     stock.append(-1)
64 |     stock.reverse()
65 |     belief = range(1, 14)  # not sure
66 |     belief.append(-1)
67 |     belief.reverse()
68 |     lvBegin = range(1, 8)
69 |     lvBegin.append(-1)
70 |     lvBegin.reverse()
71 |     lvEnd = range(1, 8)
72 |     lvEnd.append(-1)
73 |     lvEnd.reverse()
74 |     online = [-1, 1]
75 |     currentpage = range(1, 101)
76 |     start_urls = url_generator(base_url, sex, agebegin, ageend, workcityprovince, workcitycity,
77 |                                education, occupation, marriage, h1, h2, salaryBegin, salaryEnd, h, c,
78 |                                workcityprovince1, workcitycity1, constellation, animals, stock, belief, lvBegin, lvEnd,
79 |                                online, currentpage)
80 | 
81 |     def parse(self, response):
82 |         data = {}
83 |         try:
84 |             data = json.loads(response.text)
85 |         except:
86 |             traceback.print_exc()
87 |         if data and 'data' in data:
88 |             doc_list = data['data']
89 |             if len(doc_list) > 0:
90 |                 mongo = mymongo.MyMongo()
91 |                 mongo.insert_doc('ZhenAi', 'SimpleData', doc_list)
92 |                 for doc in doc_list:
93 |                     member_id = doc['memberId']
94 |                     url = 'http://album.zhenai.com/u/{}?flag=s'.format(member_id)
95 |                     yield scrapy.Request(url=url, callback=self.parse_detail)
96 | 
97 |     def parse_detail(self, response):
98 |         items = ZhenaiItem()
99 |         html_selector = scrapy.Selector(response)
100 |         url = get_base_url(response)
101 |         items['pic_url'] = html_selector.xpath(
102 |             '//div[@id="AblumsThumbsListID"]/ul/li/p/img[1]/@data-big-img').extract()
103 | 
104 |         honesty_charm = html_selector.xpath('//p[@class="brief-info fs14 lh32 c9f"]/span/span/text()').extract()
105 |         honesty = '--'
106 |         charm = '--'
107 |         if len(honesty_charm) == 2:
108 |             honesty = honesty_charm[0]
109 |             charm = honesty_charm[1]
110 |         zhima_info = html_selector.xpath(
111 |             '//p[@class="brief-name lh32 blue"]//a[@class="flag-credit credit-js"]/text()').extract_first()
112 |         if zhima_info:
113 |             zhima_info = zhima_info.replace(u'\u5206', '')
114 |             if not zhima_info.isdigit():
115 |                 zhima_info = '--'
116 |         else:
117 |             zhima_info = '--'
118 | 
119 |         brief_table_td = html_selector.xpath(
120 |             '//table[@class="brief-table"]//td').extract()  # ['x: y']
121 |         brief_dict = {}
122 |         for td in brief_table_td:
123 |             key, value = get_brief_td_to_key_value(td)
124 |             if key is not None and value is not None:
125 |                 brief_dict[key] = value
126 | 
127 |         nick_name = html_selector.xpath('//a[@class="name fs24"]/text()').extract_first()
128 | 
129 |         id_str = html_selector.xpath('//p[@class="brief-info fs14 lh32 c9f"]/text()').extract_first()
130 |         id = re.findall('ID.*?(\d+)', id_str)[0]
131 | 
132 |         person_os = html_selector.xpath(
133 |             '//div[@class="mod-tab-info"]//div[@class="info-item slider info-inner"]'
134 |             '//p[@class="fs14 lh20 c5e slider-area-js"]/text()').extract()
135 | 
136 |         data_table_td = html_selector.xpath('//div[@class="info-floor floor-data posr clearfix"]//table//td').extract()
137 |         data_dict = {}
138 |         for td in data_table_td:
139 |             key, value = get_info_td_to_key_value(td)
140 |             if key is not None and value is not None:
141 |                 data_dict[key] = value
142 | 
143 |         life_table_td = html_selector.xpath('//div[@class="info-floor floor-life posr clearfix"]//table//td').extract()
html_selector.xpath('//div[@class="info-floor floor-life posr clearfix"]//table//td').extract() 144 | life_dict = {} 145 | for td in life_table_td: 146 | key, value = get_info_td_to_key_value(td) 147 | if key is not None and value is not None: 148 | life_dict[key] = value 149 | 150 | hobby_table_td = html_selector.xpath( 151 | '//div[@class="info-floor floor-hobby posr clearfix"]//table//td').extract() 152 | hobby_dict = {} 153 | for td in hobby_table_td: 154 | key, value = get_info_td_to_key_value(td) 155 | if key is not None and value is not None: 156 | hobby_dict[key] = value 157 | 158 | term_table_td = html_selector.xpath('//div[@class="info-floor floor-term posr clearfix"]//table//td').extract() 159 | term_dict = {} 160 | for td in term_table_td: 161 | key, value = get_info_td_to_key_value(td) 162 | if key is not None and value is not None: 163 | term_dict[key] = value 164 | 165 | all_data = { 166 | 'nick_name': nick_name, 167 | 'url': url, 168 | 'member_id': id, 169 | 'person_os': person_os, 170 | 'honesty': honesty, 171 | 'zhima': zhima_info, 172 | 'charm': charm, 173 | 'brief_data': brief_dict, 174 | 'data': data_dict, 175 | 'life': life_dict, 176 | 'hobby': hobby_dict, 177 | 'term': term_dict, 178 | 'pic_url': items['pic_url'] 179 | } 180 | mongo = mymongo.MyMongo() 181 | mongo.insert_doc('ZhenAi', 'CompleteData', all_data) 182 | 183 | return items 184 | -------------------------------------------------------------------------------- /JDCommentSpider/jdcomment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import urllib 5 | import requests 6 | import re 7 | import math 8 | import time 9 | import random 10 | import json 11 | import codecs 12 | import datetime 13 | from collections import OrderedDict 14 | 15 | KEYWORD = '美年健康' 16 | 17 | # code starts here 18 | URL_ENCODE_KEYWORD = urllib.quote(KEYWORD) 19 | # search page 20 | SEARCH_REFERER = 'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={keyword}&page={page_keyword}&s={start_item}&click=0' 21 | # search result endpoint 22 | SEARCH_URL = 'https://search.jd.com/s_new.php' 23 | # Host for the search endpoint 24 | SEARCH_HOST = 'search.jd.com' 25 | # product (comment) page 26 | COMMENT_REFERER = 'https://item.jd.com/ID.html' 27 | # comment result endpoint 28 | COMMENT_URL = 'https://sclub.jd.com/comment/productPageComments.action' 29 | # Host for the comment endpoint 30 | COMMENT_HOST = 'sclub.jd.com' 31 | 32 | CSV_SEQ = '|' # csv field separator 33 | 34 | # placeholders used with str.replace 35 | PAGE_KEYWORD = 'PAGE' 36 | START_ITEM_KEYWORD = 'SS' 37 | COMMENT_REFERER_KEYWORD = 'ID' 38 | 39 | # initialize the search page referer 40 | SEARCH_REFERER = SEARCH_REFERER.format(keyword=URL_ENCODE_KEYWORD, page_keyword=PAGE_KEYWORD, 41 | start_item=START_ITEM_KEYWORD) 42 | 43 | DEFAULT_REQUEST_HEADERS = { 44 | # 'Host': 'search.jd.com', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 46 | 'Accept': '*/*', 47 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 48 | 'Accept-Encoding': 'gzip, deflate, br', 49 | 'X-Requested-With': 'XMLHttpRequest', 50 | # 'Referer': BASE_REFERER, 51 | 'Connection': 'keep-alive', 52 | 'Pragma': 'no-cache', 53 | 'Cache-Control': 'no-cache', 54 | } 55 | 56 | COMMENT_REQUEST_HEADERS = { 57 | # 'Host': 'sclub.jd.com', 58 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 59 | 'Accept': '*/*', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 61 | 'Accept-Encoding': 'gzip, deflate, br', 62 | # 'Referer': 
'https://item.jd.com/12571462129.html', 63 | 'Connection': 'keep-alive', 64 | } 65 | 66 | ITEM_REQUEST_HEADERS = { 67 | 'Host': 'item.jd.com', 68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 69 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 70 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 71 | 'Accept-Encoding': 'gzip, deflate, br', 72 | 'Connection': 'keep-alive', 73 | 'Upgrade-Insecure-Requests': '1', 74 | 'If-Modified-Since': 'Wed, 08 Nov 2017 05:49:40 GMT', 75 | } 76 | 77 | DATA = { 78 | 'keyword': KEYWORD, 79 | 'enc': 'utf-8', 80 | 'qrst': '1', 81 | 'rt': '1', 82 | 'stop': '1', 83 | 'vt': '2', 84 | 'wq': KEYWORD, 85 | 'page': '1', 86 | 's': '1', 87 | 'psort': '4', # sort by number of comments 88 | 'scrolling': 'y', 89 | 'tpl': '1_M', 90 | 'log_id': '1510033965.96458', 91 | 'show_items': '', 92 | } 93 | 94 | COMMENT_DATA = { 95 | # 'callback': 'fetchJSON_comment98vv152', # the three trailing digits are random 96 | # 'productId': '12571462129', # product id 97 | 'score': '0', 98 | 'sortType': '5', 99 | # 'page': '0', # comment page number (0-based) 100 | 'pageSize': '10', 101 | 'isShadowSku': '0', 102 | 'fold': '1', 103 | } 104 | 105 | COMMENT_EXTRA_KEYWORDS = [ 106 | ('id', u'用户ID'), 107 | ('nickname', u'用户名'), 108 | ('content', u'评论内容'), 109 | ('creationTime', u'评论日期'), 110 | ('score', u'评分'), 111 | ('referenceName', u'商品名称'), 112 | ('productColor', u'产品类型'), 113 | ] 114 | 115 | COMMENT_EXTRA_KEYWORDS_DICT = OrderedDict() 116 | for key, value in COMMENT_EXTRA_KEYWORDS: 117 | COMMENT_EXTRA_KEYWORDS_DICT[key] = value 118 | 119 | 120 | def parse_html_to_get_ids(html_content): 121 | ids = [] 122 | pattern = '
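# -- A minimal, hypothetical sketch (NOT the original code, whose remainder is
# not preserved above): the 'data-sku' regex and both helper names below are
# assumptions, built only from the constants defined earlier in this file and
# from the README that follows.
def parse_html_to_get_ids_sketch(html_content):
    # assumption: JD search result items carry a data-sku="<id>" attribute
    return re.findall(r'data-sku="(\d+)"', html_content)


def fetch_comment_page(product_id, page):
    # hypothetical helper: fetch one page of comments for a product (product_id
    # as a string). Per the README below, the endpoint checks Host and Referer,
    # the trailing callback digits are arbitrary, and pages count from 0.
    headers = COMMENT_REQUEST_HEADERS.copy()
    headers['Host'] = COMMENT_HOST
    headers['Referer'] = COMMENT_REFERER.replace(COMMENT_REFERER_KEYWORD, product_id)
    data = COMMENT_DATA.copy()
    callback = 'fetchJSON_comment98vv%d' % random.randint(100, 999)
    data.update({'callback': callback, 'productId': product_id, 'page': str(page)})
    resp = requests.get(COMMENT_URL, params=data, headers=headers)
    # strip the jsonp wrapper "callback({...});" before parsing
    return json.loads(resp.content[len(callback) + 1:-2])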
-------------------------------------------------------------------------------- /JDCommentSpider/README.md: -------------------------------------------------------------------------------- 95 | ### Comment search API 96 | 1. The `COMMENT_URL` endpoint validates the `Host` and `Referer` request headers, so both must be set exactly 97 | 2. `page` starts from 0, with no odd/even page split 98 | 99 | #### url 100 | ```python 101 | # product (comment) page 102 | COMMENT_REFERER = 'https://item.jd.com/ID.html' 103 | # comment result endpoint 104 | COMMENT_URL = 'https://sclub.jd.com/comment/productPageComments.action' 105 | ``` 106 | #### headers 107 | ```python 108 | COMMENT_REQUEST_HEADERS = { 109 | 'Host': 'sclub.jd.com', 110 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 111 | 'Accept': '*/*', 112 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 113 | 'Accept-Encoding': 'gzip, deflate, br', 114 | 'Referer': 'https://item.jd.com/12571462129.html', # the corresponding product page 115 | 'Connection': 'keep-alive', 116 | } 117 | ``` 118 | #### params 119 | ```python 120 | COMMENT_DATA = { 121 | 'callback': 'fetchJSON_comment98vv152', # the three trailing digits are random 122 | 'productId': '12571462129', # product id 123 | 'score': '0', 124 | 'sortType': '5', 125 | 'page': '0', # comment page number (0-based) 126 | 'pageSize': '10', 127 | 'isShadowSku': '0', 128 | 'fold': '1', 129 | } 130 | ``` 131 | #### response 132 | ```javascript 133 | fetchJSON_comment98vv122({ 134 | "productAttr": null, 135 | "productCommentSummary": { 136 | "goodRateShow": 97, 137 | "poorRateShow": 1, 138 | "poorCountStr": "10+", 139 | "averageScore": 5, 140 | "generalCountStr": "20+", 141 | "oneYear": 0, 142 | "showCount": 80, 143 | "showCountStr": "80+", 144 | "goodCount": 1000, 145 | "generalRate": 0.021, 146 | "generalCount": 20, 147 | "skuId": 2667959, 148 | "goodCountStr": "1000+", 149 | "poorRate": 0.011, 150 | "afterCount": 3, 151 | "goodRateStyle": 145, 152 | "poorCount": 10, 153 | "skuIds": null, 154 | "poorRateStyle": 2, 155 | "generalRateStyle": 3, 156 | "commentCountStr": "1000+", 157 | "commentCount": 1000, 158 | "productId": 2667959, 159 | "afterCountStr": "3", 160 | "defaultGoodCount": 300, 161 | "goodRate": 0.968, 162 | "generalRateShow": 2, 163 | "defaultGoodCountStr": "300+" 164 | }, 165 | "hotCommentTagStatistics": [{ 166 | "id": "1467260", 167 | "name": "送货快", 168 | "status": 0, 169 | "rid": "16727", 170 | "productId": 2667959, 171 | "count": 1, 172 | "modified": "2017-01-09 11:44:35", 173 | "type": 0, 174 | "canBeFiltered": false 175 | }], 176 | "jwotestProduct": "99", 177 | "maxPage": 66, 178 | "score": 0, 179 | "soType": 5, 180 | "imageListCount": 116, 181 | "vTagStatistics": null, 182 | "comments": [{ 183 | "id": 10827600829, 184 | "guid": "095f5c63-6c5d-4e7b-af58-5fe05c2003c7", 185 | "content": "检查完了,本人是在广州使用,服务态度没得说,非常好,每个科室医生检查详细,还有送早餐,以后每年检查都会去爱康国宾。", 186 | "creationTime": "2017-10-01 17:54:31", 187 | "isTop": false, 188 | "referenceId": "2667959", 189 | "referenceImage": "jfs/t7381/348/4133538107/380092/22e1dcab/59ffb879N4d2e5230.jpg", 190 | "referenceName": "爱康国宾(ikang)体检卡 深爱老公老婆体检套餐 全国门店通用", 191 | "referenceTime": "2017-09-07 18:29:05", 192 | "referenceType": "Product", 193 | "referenceTypeId": 0, 194 | "firstCategory": 9192, 195 | "secondCategory": 14203, 196 | "thirdCategory": 14204, 197 | "replyCount": 0, 198 | "score": 5, 199 | "status": 1, 200 | "title": "", 201 | "usefulVoteCount": 1, 202 | "uselessVoteCount": 0, 203 | "userImage": "misc.360buyimg.com/user/myjd-2015/css/i/peisong.jpg", 204 | "userImageUrl": "misc.360buyimg.com/user/myjd-2015/css/i/peisong.jpg", 205 | "userLevelId": "105", 206 | "userProvince": "", 207 | "viewCount": 0, 208 | "orderId": 0, 209 | "isReplyGrade": false, 210 | "nickname": "帅***飞", 211 | "userClient": 2, 212 | "images": [{ 213 | "id": 415870745, 214 | "associateId": 
263301426, 215 | "productId": 0, 216 | "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t10873/14/490911554/1437197/5ba03569/59d11223Na36b22bd.jpg", 217 | "available": 1, 218 | "pin": "", 219 | "dealt": 0, 220 | "imgTitle": "", 221 | "isMain": 0, 222 | "jShow": 0 223 | }], 224 | "showOrderComment": { 225 | "id": 263301426, 226 | "guid": "9a24cb67-b7e0-40a6-ac1c-9960ff5d2707", 227 | "content": "检查完了,本人是在广州使用,服务态度没得说,非常好,每个科室医生检查详细,还有送早餐,以后每年检查都会去爱康国宾。", 228 | "creationTime": "2017-10-02 00:04:51", 229 | "isTop": false, 230 | "referenceId": "2667959", 231 | "referenceType": "Order", 232 | "referenceTypeId": 0, 233 | "firstCategory": 0, 234 | "secondCategory": 0, 235 | "thirdCategory": 0, 236 | "replyCount": 0, 237 | "score": 0, 238 | "status": 1, 239 | "usefulVoteCount": 0, 240 | "uselessVoteCount": 0, 241 | "userProvince": "", 242 | "viewCount": 0, 243 | "orderId": 0, 244 | "isReplyGrade": false, 245 | "userClient": 2, 246 | "isDeal": 1, 247 | "integral": -20, 248 | "userImgFlag": 0, 249 | "anonymousFlag": 1, 250 | "recommend": false, 251 | "userLevelColor": "#666666", 252 | "userClientShow": "来自京东iPhone客户端", 253 | "isMobile": true 254 | }, 255 | "mergeOrderStatus": 2, 256 | "discussionId": 263301426, 257 | "productColor": "深爱老公老婆", 258 | "productSize": "", 259 | "imageCount": 3, 260 | "integral": -20, 261 | "userImgFlag": 0, 262 | "anonymousFlag": 1, 263 | "userLevelName": "PLUS会员", 264 | "plusAvailable": 201, 265 | "userExpValue": 31721, 266 | "productSales": [], 267 | "recommend": true, 268 | "userLevelColor": "#e1a10a", 269 | "userClientShow": "来自京东iPhone客户端", 270 | "isMobile": true, 271 | "days": 24, 272 | "afterDays": 0 273 | }, 274 | ] 275 | }); 276 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /ZhihuSpider/zhihu_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import requests 5 | import json 6 | import re 7 | import base64 8 | import hmac 9 | import hashlib 10 | import time 11 | import datetime 12 | import codecs 13 | from PIL import Image 14 | 15 | import sys 16 | 17 | reload(sys) 18 | sys.setdefaultencoding("utf-8") 19 | 20 | 21 | class Zhihu(object): 22 | sigup_url = 'https://www.zhihu.com/signup' 23 | home_url = 'https://www.zhihu.com' 24 | sigin_url = 'https://www.zhihu.com/api/v3/oauth/sign_in' 25 | search_url = 'https://www.zhihu.com/search' 26 | search_api_url = "https://www.zhihu.com/api/v4/search_v3" 27 | question_page_url = 'https://www.zhihu.com/question/' 28 | comment_api_url = 'https://www.zhihu.com/api/v4/questions/{question_id}/answers' 29 | 30 | client_id = 'c3cef7c66a1843f8b3a9e6a1e3160e20' 31 | authorization = 'oauth ' + client_id 32 | 33 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \ 34 | 'Chrome/64.0.3282.186 Safari/537.36' 35 | 36 | simple_headers = { 37 | 'User-Agent': user_agent, 38 | } 39 | 40 | headers_sigup = { 41 | 'User-Agent': user_agent, 42 | 'Connection': 'keep-alive', 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 44 | 'Accept-Encoding': 'gzip, deflate, br', 45 | 'Accept-Language': 'zh-CN,zh;q=0.9', 46 | 'Host': 'www.zhihu.com' 47 | } 48 | 49 | headers_sigin = { 50 | 'authorization': authorization, 51 | 'Referer': sigup_url, 52 | 'Origin': home_url, 53 | 'User-Agent': user_agent, 54 | } 55 | 56 | headers_captcha = { 57 | 'authorization': authorization, 58 | 'Referer': sigup_url, 59 | 'User-Agent': user_agent, 60 | } 61 | 62 | login_payload = { 63 | 'client_id': client_id, 64 | 'grant_type': 'password', 65 | 'source': 'com.zhihu.web', 66 | 'lang': 'en', 67 | 'ref_source': 'other', 68 | 'utm_source': None, 69 | } 70 | 71 | search_api_headers = { 72 | 'accept': 'application/json, text/plain, */*', 73 | 'Accept-Encoding': 'gzip, deflate, br', 74 | 'Accept-Language': 'zh-CN,zh;q=0.9', 75 | # 'authorization': 'Bearer 2|1:0|10:1520824661|4:z_c0|92:Mi4xOW43QUF3QUFBQUFBVU1JQkxabUREQ1lBQUFCZ0FsVk5WRC1UV3dEY1hZZzl0QjRxVDFHSmpKbFFCY2NpT0lqVlNR|ae27d5db5fb5be6be4a9e8dcfb871161169b9cd00eb9265341366fbadabffaca', 76 | # 'Cookie': '_zap=a5e45e29-dbb0-4bcc-b318-23dfca5fd933; q_c1=bf06d2672d984917b2f06efa033cc30f|1505131157000|1502242612000; d_c0="AFDCAS2ZgwyPTmHKenJ488LtpT5Eu0sVI_o=|1507769845"; __utma=51854390.746326380.1507769846.1512042244.1512536288.6; __utmz=51854390.1512536288.6.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/question/41295948; __utmv=51854390.000--|2=registration_date=20161129=1^3=entry_date=20170809=1; q_c1=bf06d2672d984917b2f06efa033cc30f|1520305668000|1502242612000; aliyungf_tc=AQAAAIhMQwyLaQkA7G4Ot+e2JkBHD3BH; _xsrf=23549cd1-24ba-47c3-be8b-cd31a966c66b; capsion_ticket="2|1:0|10:1520824649|14:capsion_ticket|44:NjE3ZGEyYjkzNTIwNDJiNGFjZDdiMjIyMWFhMGYxMjA=|40c554d334082285d04501cd6739f752f3e07145745d495bfcd32c00ebf5d5f1"; z_c0="2|1:0|10:1520824661|4:z_c0|92:Mi4xOW43QUF3QUFBQUFBVU1JQkxabUREQ1lBQUFCZ0FsVk5WRC1UV3dEY1hZZzl0QjRxVDFHSmpKbFFCY2NpT0lqVlNR|ae27d5db5fb5be6be4a9e8dcfb871161169b9cd00eb9265341366fbadabffaca"', 77 | 'Host': 'www.zhihu.com', 78 | # 'Referer': 'https://www.zhihu.com/search?type=content&q=python', 79 | 
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', 80 | 'X-API-Version': '3.0.91', 81 | 'X-App-Za': 'OS=Web', 82 | # 'X-UDID': 'AFDCAS2ZgwyPTmHKenJ488LtpT5Eu0sVI_o=', 83 | } 84 | 85 | search_api_payload = { 86 | 't': 'general', 87 | 'correction': '1', 88 | 'limit': '10', 89 | 'q': 'python', 90 | 'search_hash_id': '0ca5c03842318b3fdb51cfc4c11340e9', 91 | 'offset': '0' 92 | } 93 | 94 | comment_api_headers = { 95 | 'accept': 'application/json, text/plain, */*', 96 | 'Accept-Language': 'zh-CN,zh;q=0.9', 97 | 'Host': 'www.zhihu.com', 98 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', 99 | } 100 | 101 | comment_api_payload = { 102 | 'sort_by': 'default', 103 | 'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics', 104 | 'limit': '5', 105 | 'offset': '5', 106 | } 107 | 108 | def __init__(self, username, password): 109 | self.username, self.password = username, password 110 | self.session = requests.session() 111 | 112 | def get_token(self): 113 | resp = self.session.get(self.sigup_url, headers=self.headers_sigup, allow_redirects=False) 114 | return resp.cookies['_xsrf'] 115 | 116 | def get_captcha(self): 117 | captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha' 118 | query_string_parameters = {'lang': 'en'} 119 | resp = self.session.get(captcha_url, params=query_string_parameters, headers=self.headers_captcha) 120 | 121 | if json.loads(resp.text)['show_captcha']: 122 | resp = self.session.put(captcha_url, data=query_string_parameters, headers=self.headers_captcha) 123 | print resp.content 124 | img_data = base64.b64decode(resp.json()['img_base64']) 125 | with open('captcha_zhihu.png', 'wb') as f: 126 | f.write(img_data) 127 | 128 | # open the captcha image 129 | image = Image.open('captcha_zhihu.png') 130 | image.show() 131 | 132 | captcha = raw_input(u'Please enter the captcha: ') 133 | return captcha 134 | else: 135 | return None 136 | 137 | def get_signature(self, timestamp): 138 | h = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=hashlib.sha1) 139 | grant_type = self.login_payload['grant_type'] 140 | source = self.login_payload['source'] 141 | h.update(grant_type + self.client_id + source + timestamp) 142 | return h.hexdigest() 143 | 144 | def check_login(self): 145 | resp = self.session.get(self.sigup_url, allow_redirects=True, headers=self.simple_headers) 146 | if resp.url == self.home_url: 147 | return True 148 | 149 | def login(self): 150 | xsrf_token = self.get_token() 151 | timestamp = str(int(time.time() * 1000)) 152 | signature = self.get_signature(timestamp) 153 | captcha = self.get_captcha() 154 | self.login_payload.update({ 155 | 'username': self.username, 156 | 'password': self.password, 157 | 'timestamp': timestamp, 158 | 'signature': signature, 159 | 'captcha': captcha, 160 | }) 161 | self.headers_sigin.update({'X-Xsrftoken': xsrf_token}) 162 | resp = self.session.post(self.sigin_url, data=self.login_payload, headers=self.headers_sigin, 163 | allow_redirects=False) 164 | check = 
self.check_login() 165 | if 'error' in resp.text: 166 | print resp.text 167 | elif check: 168 | print u'Login succeeded!' 169 | 170 | @staticmethod 171 | def html_tags_eraser(htmls): 172 | htmls = unicode(htmls) 173 | pattern = re.compile(r'<[^>]+>', re.S) 174 | return pattern.sub('', htmls).replace('|', ' ').replace('\r', '').replace('\n', '') 175 | 176 | def search_questions(self, keyword): 177 | # obtain the search_hash_id 178 | search_payload = { 179 | 'type': 'content', 180 | 'q': keyword 181 | } 182 | resp = self.session.get(url=self.search_url, params=search_payload, headers=self.simple_headers) 183 | referer = resp.url 184 | 185 | hash_id = None 186 | hash_id_pattern = "search_hash_id=([\d\w]+)" 187 | result = re.search(hash_id_pattern, resp.content) 188 | if result: 189 | hash_id = result.group(1) 190 | 191 | # get x-uuid and authorization from the cookies 192 | self.session.get(url=self.search_url, params=search_payload, headers=self.simple_headers) 193 | print self.session.cookies 194 | x_uuid = self.session.cookies['d_c0'].split('|')[0].replace('"', '') 195 | authorization = 'Bearer ' + self.session.cookies['z_c0'].replace('"', '') 196 | 197 | self.search_api_headers.update({'authorization': authorization, 'Referer': referer, 'X-UDID': x_uuid}) 198 | print self.search_api_headers 199 | 200 | questions = [] 201 | offset = 0 202 | limit = 10 203 | while True: 204 | print 'Getting search result {} - {}'.format(offset, offset + limit) 205 | self.search_api_payload.update({'q': keyword, 'search_hash_id': hash_id, 206 | 'offset': str(offset), 'limit': str(limit)}) 207 | resp = self.session.get(self.search_api_url, params=self.search_api_payload, 208 | headers=self.search_api_headers) 209 | json_data = resp.json() 210 | 211 | is_end = json_data['paging']['is_end'] 212 | items = json_data['data'] 213 | # keep only question-type results 214 | questions.extend([item['object']['question']['url'].split('/')[-1] for item in items 215 | if 'object' in item and 'question' in item['object']]) 216 | 217 | if is_end: 218 | break 219 | else: 220 | offset += limit 221 | 222 | return questions 223 | 224 | def get_comments(self, question_ids): 225 | # build the output file 226 | filename = 'zhihu-%s-comments.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M')) 227 | # write the csv header 228 | with codecs.open(filename, 'w', 'utf_8_sig') as f: 229 | f.write(u'评论id|评论url|评论内容|评论创建时间|评论更新时间|作者|作者url|作者id|问题id|问题标题' 230 | u'|问题url|问题创建时间|问题更新时间\r\n') 231 | 232 | limit = 5 233 | 234 | for question_id in question_ids: 235 | print 'Processing Question {}'.format(question_id) 236 | question_page_url = self.question_page_url + question_id 237 | comment_api_url = self.comment_api_url.format(question_id=question_id) 238 | print comment_api_url 239 | 240 | # get x-uuid and authorization from the cookies 241 | self.session.get(url=question_page_url, headers=self.simple_headers) 242 | print self.session.cookies 243 | x_uuid = self.session.cookies['d_c0'].split('|')[0].replace('"', '') 244 | authorization = 'Bearer ' + self.session.cookies['z_c0'].replace('"', '') 245 | # assemble the request headers 246 | self.comment_api_headers.update({'authorization': authorization, 'Referer': question_page_url, 247 | 'X-UDID': x_uuid}) 248 | print self.comment_api_headers 249 | 250 | offset = 0 251 | while True: 252 | print 'Getting comment result {} - {}'.format(offset, offset + limit) 253 | self.comment_api_payload.update({'limit': str(limit), 'offset': str(offset)}) 254 | resp = self.session.get(comment_api_url, params=self.comment_api_payload, 255 | headers=self.comment_api_headers) 256 | print resp.text 257 | json_data = resp.json() 258 
| 259 | is_end = json_data['paging']['is_end'] 260 | items = json_data['data'] 261 | 262 | for item in items: 263 | created_time = item['created_time'] 264 | updated_time = item['updated_time'] 265 | id_ = item['id'] 266 | url = item['url'] 267 | content = item['content'] 268 | 269 | author_name = item['author']['name'] 270 | author_url = item['author']['url'] 271 | author_id = item['author']['id'] 272 | 273 | question_title = item['question']['title'] 274 | question_url = item['question']['url'] 275 | question_created = item['question']['created'] 276 | question_updated = item['question']['updated_time'] 277 | 278 | with codecs.open(filename, 'a', 'utf_8_sig') as f: 279 | content = '|'.join( 280 | map(self.html_tags_eraser, (id_, url, content, created_time, updated_time, author_name, 281 | author_url, author_id, question_id, question_title, 282 | question_url, 283 | question_created, question_updated))) + '\r\n' 284 | print content 285 | f.write(content) 286 | 287 | if is_end: 288 | break 289 | offset += limit 290 | 291 | 292 | if __name__ == '__main__': 293 | zhihu = Zhihu('username', 'password') 294 | zhihu.login() 295 | question_ids = zhihu.search_questions('python') 296 | print len(question_ids) 297 | print question_ids 298 | zhihu.get_comments(question_ids) 299 | -------------------------------------------------------------------------------- /WeiboCommentSpider/weibocomment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Huangcc 3 | 4 | import urllib 5 | import base64 6 | import rsa 7 | import binascii 8 | import requests 9 | import time 10 | import json 11 | import datetime 12 | import codecs 13 | import re 14 | from collections import defaultdict 15 | 16 | MAX_INT = 999999 17 | 18 | 19 | class Weibo(): 20 | def __init__(self, user, passwd): 21 | self.username = user 22 | self.password = passwd 23 | 24 | self.get_login_params_url = 'https://login.sina.com.cn/sso/prelogin.php' 25 | self.get_login_params_headers = { 26 | 'Host': 'login.sina.com.cn', 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 28 | 'Accept': '*/*', 29 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 30 | 'Accept-Encoding': 'gzip, deflate, br', 31 | 'Referer': 'https://weibo.com/login.php', 32 | 'Connection': 'keep-alive', 33 | } 34 | self.get_login_params_data = { 35 | 'entry': 'weibo', 36 | 'callback': 'sinaSSOController.preloginCallBack', 37 | 'su': '', 38 | 'rsakt': 'mod', 39 | 'client': 'ssologin.js(v1.4.19)', 40 | '_': '1510196975574', # millisecond timestamp 41 | } 42 | 43 | self.login_params = { 44 | 'entry': 'weibo', 45 | 'gateway': '1', 46 | 'from': '', 47 | 'savestate': '7', 48 | 'qrcode_flag': 'false', 49 | 'useticket': '1', 50 | 'pagerefer': '', 51 | 'vsnf': '1', 52 | # 'su': 'MTUxMTk5MDQ5NzElNDBzaW5hLmNu', 53 | 'service': 'miniblog', 54 | # 'servertime': '1510195867', 55 | # 'nonce': 'P6ZMJ7', 56 | 'pwencode': 'rsa2', 57 | # 'rsakv': '1330428213', 58 | # 'sp': '', 59 | 'sr': '1920*1080', 60 | 'encoding': 'UTF-8', 61 | 'prelt': '210', 62 | 'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack', 63 | 'returntype': 'META', 64 | } 65 | self.login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)' 66 | self.login_headers = { 67 | 'Host': 'login.sina.com.cn', 68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 69 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 70 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 71 | 'Accept-Encoding': 'gzip, deflate, br', 72 | 'Referer': 'https://weibo.com/login.php', 73 | 'Connection': 'keep-alive', 74 | 'Upgrade-Insecure-Requests': '1' 75 | } 76 | 77 | self.search_headers = { 78 | 'Host': 'm.weibo.cn', 79 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 80 | 'Accept': 'application/json, text/plain, */*', 81 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 82 | 'Accept-Encoding': 'gzip, deflate, br', 83 | 'X-Requested-With': 'XMLHttpRequest', 84 | 'Referer': '', 85 | } 86 | self.search_referer = 'https://m.weibo.cn/p/100103type%3D2%26q%3D{keyword}?type=wb&queryVal={keyword}' \ 87 | '&featurecode=20000320&luicode=10000011&lfid=106003type%3D1&title={keyword}' 88 | self.search_url = 'https://m.weibo.cn/api/container/getIndex' 89 | self.search_data = { 90 | 'type': 'wb', 91 | 'queryVal': '{keyword}', 92 | 'featurecode': '20000320', 93 | 'luicode': '10000011', 94 | 'lfid': '106003type=1', 95 | 'title': '{keyword}', 96 | 'containerid': '100103type=2&q={keyword}', 97 | } 98 | 99 | self.comment_headers = { 100 | 'Host': 'm.weibo.cn', 101 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0', 102 | 'Accept': 'application/json, text/plain, */*', 103 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 104 | 'Accept-Encoding': 'gzip, deflate, br', 105 | 'X-Requested-With': 'XMLHttpRequest', 106 | # 'Referer': 'https://m.weibo.cn/status/4172328575544621', 107 | 'Connection': 'keep-alive', 108 | } 109 | self.comment_referer = 'https://m.weibo.cn/status/{weibo_id}' 110 | self.comment_url = 'https://m.weibo.cn/api/comments/show' 111 | self.comment_data = { 112 | 'id': '{id}', 113 | 'page': '{page}', 114 | } 115 | 116 | self.session = requests.session() 117 | self.servertime = None 118 | self.nonce = None 119 | self.pubkey = None 120 | self.rsakv = None 121 | self.keyword = None 122 | self.max_try = 5 123 | self.weibo_ids = [] 124 | self.csv_seq = '|' 125 | 126 | def get_b64_username(self): 127 | username = urllib.quote(self.username) 128 | username = base64.encodestring(username)[:-1] 129 | return username 130 | 131 | def get_rsa_password(self, pubkey, nonce, server_time): 132 | # var RSAKey = new sinaSSOEncoder.RSAKey(); 133 | # RSAKey.setPublic(me.rsaPubkey, "10001"); 134 | # password = RSAKey.encrypt([me.servertime, me.nonce].join("\t") + "\n" + password) 135 | rsaPublickey = int(pubkey, 16) 136 | key = rsa.PublicKey(rsaPublickey, int("10001", 16)) # build the public key 137 | message = str(server_time) + '\t' + str(nonce) + '\n' + str(self.password) # assemble the plaintext, mirroring Sina's login js above 138 | passwd = rsa.encrypt(message, key) # encrypt 139 | passwd = binascii.b2a_hex(passwd) # hex-encode the ciphertext 140 | return passwd 141 | 142 | def get_login_params(self): 143 | self.get_login_params_data['_'] = int(1000 * time.time()) 144 | resp = self.session.get(self.get_login_params_url, params=self.get_login_params_data, 145 | headers=self.get_login_params_headers) 146 | raw_data = resp.content 147 | raw_data = raw_data[len('sinaSSOController.preloginCallBack('):-1] 148 | json_data = json.loads(raw_data) 149 | return json_data['servertime'], json_data['nonce'], json_data['pubkey'], json_data['rsakv'] 150 | 151 | def set_login_params(self, servertime, nonce, pubkey, rsakv): 152 | self.servertime, self.nonce, self.pubkey, self.rsakv = servertime, nonce, pubkey, rsakv 153 | 154 | def login(self): 155 | user = 
self.get_b64_username() 156 | passwd = self.get_rsa_password(self.pubkey, self.nonce, self.servertime) 157 | self.login_params['su'] = user 158 | self.login_params['servertime'] = self.servertime 159 | self.login_params['nonce'] = self.nonce 160 | self.login_params['rsakv'] = self.rsakv 161 | self.login_params['sp'] = passwd 162 | 163 | resp = self.session.post(self.login_url, data=self.login_params, headers=self.login_headers) 164 | if 'retcode%3D0' in resp.content: 165 | print 'login success' 166 | return True 167 | print 'login fail' 168 | print resp.content 169 | return False 170 | 171 | def set_search_keyword(self, keyword): 172 | self.keyword = keyword 173 | self.search_headers['Referer'] = self.search_referer.format(keyword=urllib.quote(self.keyword)) 174 | self.search_data['queryVal'] = self.search_data['queryVal'].format(keyword=self.keyword) 175 | self.search_data['title'] = self.search_data['title'].format(keyword=self.keyword) 176 | self.search_data['containerid'] = self.search_data['containerid'].format(keyword=self.keyword) 177 | 178 | def parse_weibo_response(self, json_data): 179 | mids = [] 180 | if json_data['cards']: 181 | cards = json_data['cards'][0]['card_group'] 182 | for card in cards: 183 | mblog = card['mblog'] 184 | mid = mblog['mid'] # weibo id 185 | comments_count = mblog['comments_count'] # number of comments 186 | if comments_count: 187 | mids.append(mid) 188 | # text = mblog['text'] # weibo text, in html form 189 | # user = mblog['user'] # poster 190 | # user_id = user['id'] 191 | # user_name = user['screen_name'] 192 | # user_desc = user['description'] 193 | return mids 194 | 195 | def set_max_try(self, n): 196 | self.max_try = n 197 | 198 | def get_weibo_item_ids(self): 199 | mids = [] 200 | is_end = False 201 | page_try = defaultdict(lambda: 0) 202 | page = 1 203 | while not is_end: 204 | self.search_data['page'] = str(page) 205 | page_try[page] += 1 206 | if page_try[page] > self.max_try: 207 | print 'page %d is more than max tries %d' % (page, self.max_try) 208 | page += 1 209 | 210 | resp = requests.get(self.search_url, params=self.search_data, headers=self.search_headers) 211 | json_data = {'ok': 0} 212 | try: 213 | json_data = resp.json() 214 | except ValueError: 215 | print resp.content 216 | if json_data['ok']: 217 | print 'start page %d' % page 218 | mids.extend(self.parse_weibo_response(json_data)) # collect the ids of all weibos that have comments 219 | if not json_data['cardlistInfo']['page']: 220 | is_end = True 221 | else: 222 | page += 1 223 | else: 224 | print 'page %d not ok' % page 225 | print 'weibo search finished, %d ids collected' % len(mids) 226 | return mids 227 | 228 | def set_weibo_ids(self, ids): 229 | self.weibo_ids = ids 230 | 231 | def html_filter(self, html_text): 232 | pattern = re.compile(r'<[^>]+>', re.S) 233 | no_html_text = pattern.sub('', html_text) 234 | return no_html_text 235 | 236 | def parse_comment_data(self, json_data): 237 | comments = [] 238 | if 'data' in json_data and json_data['data']: 239 | for comment in json_data['data']: 240 | comment_id = unicode(comment['id']) 241 | user_id = unicode(comment['user']['id']) 242 | user_name = unicode(comment['user']['screen_name']) 243 | comment_content = unicode(comment['text']) 244 | comment_content = self.html_filter(comment_content) 245 | comments.append((comment_id, user_id, user_name, comment_content)) 246 | return comments 247 | 248 | def curl_comments(self, filename=None): 249 | # create the output file and write the csv header 250 | if not filename: 251 | filename = 'weibo-comments-%s.csv' % (datetime.datetime.now().strftime('%Y%m%d-%H%M')) 252 | with codecs.open(filename, 
'a', 'utf-8') as f: 253 | f.write(self.csv_seq.join((u'评论id', u'用户id', u'用户名', u'评论')) + '\n') 254 | 255 | current_id_pos = 1 256 | max_id_pos = len(self.weibo_ids) 257 | for id_ in self.weibo_ids: 258 | print '-*- start weibo page %d/%d -*-' % (current_id_pos, max_id_pos) 259 | current_id_pos += 1 260 | 261 | # initialize the per-page iteration state 262 | page = 1 263 | page_try = defaultdict(lambda: 0) 264 | comment_headers = self.comment_headers.copy() 265 | comment_headers['Referer'] = self.comment_referer.format(weibo_id=id_) 266 | comment_data = self.comment_data.copy() 267 | comment_data['id'] = id_ 268 | max_page = MAX_INT 269 | # crawl the comment pages 270 | while page <= max_page: 271 | # limit retries; move on once the max number of tries is exceeded 272 | page_try[page] += 1 273 | if page_try[page] > self.max_try: 274 | print 'comment page %d/%d is more than max tries %d' % (page, max_page, self.max_try) 275 | page += 1 276 | # stop once past the last page 277 | if page > max_page: 278 | print 'page %d is more than max_page %d' % (page, max_page) 279 | break 280 | # stop if the total page count could not be determined 281 | if page > 1 and max_page == MAX_INT: 282 | print 'No next page %d' % page 283 | break 284 | 285 | # fetch the comments 286 | comment_data['page'] = str(page) 287 | resp = self.session.get(self.comment_url, params=comment_data, headers=comment_headers) 288 | json_data = {'ok': 0} 289 | try: 290 | json_data = resp.json() 291 | except ValueError: 292 | # a malformed response means the api cut us off, so stop crawling this weibo 293 | break 294 | if json_data['ok']: 295 | print 'start comment page %d/%d' % (page, max_page) 296 | max_page = min(json_data['max'], max_page) # total number of comment pages 297 | comments = self.parse_comment_data(json_data) # parse the comments on the current page 298 | with codecs.open(filename, 'a', 'utf-8') as f: 299 | for comment in comments: 300 | f.write(self.csv_seq.join(comment) + '\n') 301 | 302 | # advance the page counter 303 | page += 1 304 | else: 305 | print 'comment page %d/%d not ok' % (page, max_page) 306 | 307 | 308 | if __name__ == '__main__': 309 | weibo = Weibo('your_weibo_username', 'your_weibo_password') 310 | servertime, nonce, pubkey, rsakv = weibo.get_login_params() 311 | weibo.set_login_params(servertime, nonce, pubkey, rsakv) 312 | if weibo.login(): 313 | weibo.set_search_keyword('冯提莫') 314 | ids = weibo.get_weibo_item_ids() 315 | weibo.set_weibo_ids(ids) 316 | weibo.curl_comments() 317 | else: 318 | print 'Make sure that your Weibo username and password are right' 319 | -------------------------------------------------------------------------------- /WeiboCommentSpider/weibo_comment_result_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "ok": 1, 3 | "msg": "\u6570\u636e\u83b7\u53d6\u6210\u529f", 4 | "data": [ 5 | { 6 | "id": 4172544283687415, 7 | "created_at": "1\u5206\u949f\u524d", 8 | "source": "\u9b45\u65cf PRO 5", 9 | "user": { 10 | "id": 2136636343, 11 | "screen_name": "\u7af9\u897fC", 12 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.19.233.233.180\/7f5a7bb7jw1ealqnreu9oj206j08bmxf.jpg", 13 | "verified": false, 14 | "verified_type": -1, 15 | "mbtype": 0, 16 | "profile_url": "https:\/\/m.weibo.cn\/u\/2136636343?uid=2136636343", 17 | "remark": "" 18 | }, 19 | "text": "\u4e0d\u662f\u840c\u5c0f\u561b\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>", 20 | "like_counts": 0, 21 | "liked": false 22 | }, 23 | { 24 | "id": 4172544233718438, 25 | "created_at": "1\u5206\u949f\u524d", 26 | "source": "\u5c0f\u7c73Max2 \u5927\u5c4f\u5927\u7535\u91cf", 27 | "user": { 28 | "id": 5322013701, 29 | "screen_name": "JANUARYFEBRUARYMARCHAPRIL", 30 | "profile_image_url": 
"https:\/\/tvax2.sinaimg.cn\/crop.0.0.1002.1002.180\/005OaCSFly8flaxg6qj56j30ru0ru7ae.jpg", 31 | "verified": false, 32 | "verified_type": -1, 33 | "mbtype": 2, 34 | "profile_url": "https:\/\/m.weibo.cn\/u\/5322013701?uid=5322013701", 35 | "remark": "" 36 | }, 37 | "text": "\u56de\u590d@\u61d2\u7c73\u7c738711<\/a>:\u770b\u540e\u8fb9\u5199\u7684\u5b57\u513f\"[\u7b11cry]\"<\/span>", 38 | "reply_id": 4172543659142497, 39 | "reply_text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u5976\u7238\u8bf4\u662f\u840c\u5c0f", 40 | "like_counts": 0, 41 | "liked": false 42 | }, 43 | { 44 | "id": 4172544187200590, 45 | "created_at": "1\u5206\u949f\u524d", 46 | "source": "iPhone\u5ba2\u6237\u7aef", 47 | "user": { 48 | "id": 1832833387, 49 | "screen_name": "sevenlife", 50 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.512.512.180\/6d3ed16bly8fe87n2w0x7j20e80e8mx8.jpg", 51 | "verified": false, 52 | "verified_type": -1, 53 | "mbtype": 0, 54 | "profile_url": "https:\/\/m.weibo.cn\/u\/1832833387?uid=1832833387", 55 | "remark": "" 56 | }, 57 | "text": "\u8fd9\u7167\u7247\u662f\u4e0d\u662f\u6709\u70b9\u50cf\u7834\u4ea7\u59d0\u59b9\u91cc\u7684\u90a3\u8c01", 58 | "like_counts": 0, 59 | "liked": false 60 | }, 61 | { 62 | "id": 4172544179133133, 63 | "created_at": "1\u5206\u949f\u524d", 64 | "source": "\u5c0f\u7c735X \u62cd\u4eba\u66f4\u7f8e", 65 | "user": { 66 | "id": 3225859821, 67 | "screen_name": "\u5403\u751c\u7684\u767d\u65e5\u68a6", 68 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.664.664.180\/c046b6edjw8ezwhq0q53bj20ig0igq3y.jpg", 69 | "verified": false, 70 | "verified_type": -1, 71 | "mbtype": 11, 72 | "profile_url": "https:\/\/m.weibo.cn\/u\/3225859821?uid=3225859821", 73 | "remark": "" 74 | }, 75 | "text": "\u6709\u4e2a\u5c0f\u59d1\u5a18\u7167\u5230\u95ed\u773c\u775b\u4e86\uff0c\u540c\u60c5", 76 | "like_counts": 0, 77 | "liked": false 78 | }, 79 | { 80 | "id": 4172543885430014, 81 | "created_at": "2\u5206\u949f\u524d", 82 | "source": "\u5c0f\u7c73Max", 83 | "user": { 84 | "id": 6270899000, 85 | "screen_name": "\u601d\u5ff5Jill", 86 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.0.0.1002.1002.180\/006Qo3igly8fgceblxh2hj30ru0rugnt.jpg", 87 | "verified": false, 88 | "verified_type": -1, 89 | "mbtype": 11, 90 | "profile_url": "https:\/\/m.weibo.cn\/u\/6270899000?uid=6270899000", 91 | "remark": "" 92 | }, 93 | "text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u54c8\u54c8\uff0c\u559c\u95fb\u4e50\u89c1\u7ffb\u8f66\u73b0\u573a\"[\u7b11\u800c\u4e0d\u8bed]\"<\/span>", 94 | "reply_id": 4172543135276210, 95 | "reply_text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 96 | "like_counts": 0, 97 | "liked": false 98 | }, 99 | { 100 | "id": 4172543823262436, 101 | "created_at": "3\u5206\u949f\u524d", 102 | "source": "Android", 103 | "user": { 104 | "id": 3099343157, 105 | "screen_name": "\u6770\u6770ber", 106 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.960.960.180\/b8bc3935ly8fht7jimi3cj20qo0qojtw.jpg", 107 | "verified": false, 108 | "verified_type": -1, 109 | "mbtype": 2, 110 | "profile_url": "https:\/\/m.weibo.cn\/u\/3099343157?uid=3099343157", 111 | "remark": "" 112 | }, 113 | "text": "\u5b57\u6709\u70b9\u4f24\u5440", 114 | "like_counts": 0, 115 | "liked": false 116 | }, 117 | { 118 | "id": 4172543659142497, 119 | "created_at": "3\u5206\u949f\u524d", 120 | "source": 
"\u5356\u840c\u5170\u6362\u7684Android", 121 | "user": { 122 | "id": 3274546055, 123 | "screen_name": "\u61d2\u7c73\u7c738711", 124 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.996.996.180\/c32d9b87ly8feg3rclq14j20ro0ron0t.jpg", 125 | "verified": false, 126 | "verified_type": -1, 127 | "mbtype": 11, 128 | "profile_url": "https:\/\/m.weibo.cn\/u\/3274546055?uid=3274546055", 129 | "remark": "" 130 | }, 131 | "text": "\u56de\u590d@iPanda\u718a\u732b\u9891\u9053<\/a>:\u5976\u7238\u8bf4\u662f\u840c\u5c0f", 132 | "reply_id": 4172543135276210, 133 | "reply_text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 134 | "like_counts": 0, 135 | "liked": false 136 | }, 137 | { 138 | "id": 4172543521028345, 139 | "created_at": "4\u5206\u949f\u524d", 140 | "source": "OPPO R9", 141 | "user": { 142 | "id": 2605943347, 143 | "screen_name": "\u732b\u4e0e\u69b4\u83b2\u7530baocl", 144 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.996.996.180\/9b538a33ly8fl7mj42xsqj20ro0rodi7.jpg", 145 | "verified": false, 146 | "verified_type": -1, 147 | "mbtype": 0, 148 | "profile_url": "https:\/\/m.weibo.cn\/u\/2605943347?uid=2605943347", 149 | "remark": "" 150 | }, 151 | "text": "\u56fe\u4e09\u6211\u60f3\u8981\u539f\u56fe\"[\u55b5\u55b5]\"<\/span>", 152 | "like_counts": 0, 153 | "liked": false 154 | }, 155 | { 156 | "id": 4172543504466751, 157 | "created_at": "4\u5206\u949f\u524d", 158 | "source": "\u5fae\u535a weibo.com", 159 | "user": { 160 | "id": 2393112124, 161 | "screen_name": "Iamnotafraid", 162 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.0.0.180.180.180\/8ea3fe3cjw1e8qgp5bmzyj2050050aa8.jpg", 163 | "verified": false, 164 | "verified_type": -1, 165 | "mbtype": 0, 166 | "profile_url": "https:\/\/m.weibo.cn\/u\/2393112124?uid=2393112124", 167 | "remark": "" 168 | }, 169 | "text": "\u8fd9\u4e9b\u5b69\u5b50 \u90fd\u4e0d\u7528\u53bb\u4e0a\u5b66\u5417", 170 | "like_counts": 0, 171 | "liked": false 172 | }, 173 | { 174 | "id": 4172543478523580, 175 | "created_at": "4\u5206\u949f\u524d", 176 | "source": "iPhone\u5ba2\u6237\u7aef", 177 | "user": { 178 | "id": 3920727637, 179 | "screen_name": "\u6211\u53d6\u4ec0\u4e48\u540d\u5b57\u5173\u4f60\u5c41\u4e8b", 180 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.0.7.497.497.180\/e9b18e55ly8fj2a06llbvj20dt0e8wf1.jpg", 181 | "verified": false, 182 | "verified_type": -1, 183 | "mbtype": 2, 184 | "profile_url": "https:\/\/m.weibo.cn\/u\/3920727637?uid=3920727637", 185 | "remark": "" 186 | }, 187 | "text": "\u7ec8\u4e8e\u7b11\u4e86\uff0c\u8fd8\u662f\u6eda\u6eda\u5389\u5bb3\ud83d\udc3c", 188 | "like_counts": 0, 189 | "liked": false 190 | } 191 | ], 192 | "total_number": 27, 193 | "max": 3, 194 | "hot_data": [ 195 | { 196 | "id": 4172538495654634, 197 | "created_at": "24\u5206\u949f\u524d", 198 | "source": "\u5fae\u535a weibo.com", 199 | "user": { 200 | "id": 5838706073, 201 | "screen_name": "\u978b\u5382-\u6279\u53d1\u603b\u5e97", 202 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.204.174.600.600.180\/006n8CaBly8flctk2fzfbj30u00qoq6f.jpg", 203 | "verified": false, 204 | "verified_type": -1, 205 | "mbtype": 11, 206 | "profile_url": "https:\/\/m.weibo.cn\/u\/5838706073?uid=5838706073", 207 | "remark": "" 208 | }, 209 | "text": "\u4ed6\u4eec\u4e00\u5bb6\u7b7e\u540d\u90fd\u662f\u540c\u6b3e\u5b57\u4f53\u554a\"[\u7b11cry]\"<\/span>", 210 | "like_counts": 26, 211 | "liked": 
false 212 | }, 213 | { 214 | "id": 4172543135276210, 215 | "created_at": "5\u5206\u949f\u524d", 216 | "source": "\u5fae\u535a weibo.com", 217 | "user": { 218 | "id": 3222817584, 219 | "screen_name": "iPanda\u718a\u732b\u9891\u9053", 220 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.199.199.180\/c0184b30ly1fib5h4nawmj205k05kaaj.jpg", 221 | "verified": true, 222 | "verified_type": 3, 223 | "verified_type_ext": 0, 224 | "mbtype": 12, 225 | "profile_url": "https:\/\/m.weibo.cn\/u\/3222817584?uid=3222817584", 226 | "remark": "" 227 | }, 228 | "text": "\u539f\u6765\u662f\u5927\u718a\u732b\u201c\u53e4\u53e4\u201d\u201c\u63a5\u89c1\u201d\u7684\u7279\u6717\u666e\u592b\u4eba\u554a~\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>\"[\u54c8\u54c8]\"<\/span>", 229 | "like_counts": 5, 230 | "liked": false 231 | }, 232 | { 233 | "id": 4172544283687415, 234 | "created_at": "1\u5206\u949f\u524d", 235 | "source": "\u9b45\u65cf PRO 5", 236 | "user": { 237 | "id": 2136636343, 238 | "screen_name": "\u7af9\u897fC", 239 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.19.233.233.180\/7f5a7bb7jw1ealqnreu9oj206j08bmxf.jpg", 240 | "verified": false, 241 | "verified_type": -1, 242 | "mbtype": 0, 243 | "profile_url": "https:\/\/m.weibo.cn\/u\/2136636343?uid=2136636343", 244 | "remark": "" 245 | }, 246 | "text": "\u4e0d\u662f\u840c\u5c0f\u561b\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>\"[\u5141\u60b2]\"<\/span>", 247 | "like_counts": 0, 248 | "liked": false 249 | }, 250 | { 251 | "id": 4172543521028345, 252 | "created_at": "4\u5206\u949f\u524d", 253 | "source": "OPPO R9", 254 | "user": { 255 | "id": 2605943347, 256 | "screen_name": "\u732b\u4e0e\u69b4\u83b2\u7530baocl", 257 | "profile_image_url": "https:\/\/tvax4.sinaimg.cn\/crop.0.0.996.996.180\/9b538a33ly8fl7mj42xsqj20ro0rodi7.jpg", 258 | "verified": false, 259 | "verified_type": -1, 260 | "mbtype": 0, 261 | "profile_url": "https:\/\/m.weibo.cn\/u\/2605943347?uid=2605943347", 262 | "remark": "" 263 | }, 264 | "text": "\u56fe\u4e09\u6211\u60f3\u8981\u539f\u56fe\"[\u55b5\u55b5]\"<\/span>", 265 | "like_counts": 0, 266 | "liked": false 267 | }, 268 | { 269 | "id": 4172539422399038, 270 | "created_at": "20\u5206\u949f\u524d", 271 | "source": "Weibo.intl", 272 | "user": { 273 | "id": 5510916469, 274 | "screen_name": "\u738b\u98ce\u9c7c", 275 | "profile_image_url": "https:\/\/tvax2.sinaimg.cn\/crop.0.0.736.736.180\/0060Xf7vly8fl4vru5lvmj30kg0kgab1.jpg", 276 | "verified": false, 277 | "verified_type": -1, 278 | "mbtype": 0, 279 | "profile_url": "https:\/\/m.weibo.cn\/u\/5510916469?uid=5510916469", 280 | "remark": "" 281 | }, 282 | "text": "\u5c0f\u5b69\u513f\u90fd\u4e0d\u7a7f\u68c9\u5927\u8863\u4e86\u3002\u7537\u751f\u7684\u5927\u80cc\u5934\u4e5f\u662f\u9ebb\u70e6\u53d1\u578b\u5e08\u4e86\u3002", 283 | "like_counts": 4, 284 | "liked": false 285 | }, 286 | { 287 | "id": 4172540798335754, 288 | "created_at": "15\u5206\u949f\u524d", 289 | "source": "\u5fae\u535a weibo.com", 290 | "user": { 291 | "id": 5581817548, 292 | "screen_name": "M0DA1YE", 293 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.173.0.639.639.180\/0065KJJily8ff7zxx617rj30sh0hrt9n.jpg", 294 | "verified": false, 295 | "verified_type": -1, 296 | "mbtype": 12, 297 | "profile_url": "https:\/\/m.weibo.cn\/u\/5581817548?uid=5581817548", 298 | "remark": "" 299 | }, 300 | "text": "\u7edf\u4e00\u670d\u88c5\uff1f\uff1f\uff1f", 301 | "like_counts": 2, 302 | "liked": false 303 | } 304 | ], 305 | "hot_total_number": 6 306 | } 307 | 
-------------------------------------------------------------------------------- /WeiboCommentSpider/weibo_search_result_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "cardlistInfo": { 3 | "v_p": "42", 4 | "containerid": "100103type=2&q=\u7279\u6717\u666e", 5 | "title_top": "\u7279\u6717\u666e", 6 | "total": 1000, 7 | "show_style": 1, 8 | "starttime": 1510279670, 9 | "can_shared": 0, 10 | "cardlist_menus": [], 11 | "cardlist_head_cards": [], 12 | "toolbar_menus": [], 13 | "show_read_progress": null, 14 | "show_read_progress_stop": null, 15 | "page_size": 20, 16 | "page": 2 17 | }, 18 | "cards": [ 19 | { 20 | "card_type": 11, 21 | "show_type": 1, 22 | "card_group": [ 23 | { 24 | "card_type": 9, 25 | "card_type_name": "\u5fae\u535a", 26 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 27 | "actionlog": { 28 | "act_code": 554, 29 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 30 | "luicode": "", 31 | "uicode": "", 32 | "fid": "100103type=2&q=\u7279\u6717\u666e" 33 | }, 34 | "display_arrow": 0, 35 | "show_type": 1, 36 | "mblog": { 37 | "created_at": "\u521a\u521a", 38 | "id": "4172477577867420", 39 | "mid": "4172477577867420", 40 | "idstr": "4172477577867420", 41 | "text": "\u3010\u7279\u6717\u666e\u4eca\u5929\u8bf4\u8bdd\u633a\u9760\u8c31[\u60a0\u95f2][\u5472\u7259]\u3011\u4e3b\u5e2d\u5148\u751f\uff1a\u975e\u5e38\u611f\u8c22\uff0c\u548c\u60a8\u5728\u4e00\u8d77\u5f88\u8363\u5e78\u3002\u4e2d\u7f8e\u5173\u7cfb\u662f\u975e\u5e38\u91cd\u8981\u7684\u8bdd\u9898\uff0c\u6d89\u53ca\u6211\u4eec\u53cc\u65b9\u4e5f\u5305\u62ec\u5176\u4ed6\u7684\u4e00\u4e9b\u56fd\u5bb6\u3002\u6211\u4eec\u76f8\u4fe1\uff0c\u4e2d\u7f8e\u6709\u80fd\u529b\u5728\u4eca\u540e\u89e3\u51b3\u4e16\u754c\u95ee\u9898\u3002\u6628\u5929\u665a\u4e0a\u7684\u4f1a\u6664\u662f\u975e\u5e38\u68d2\u7684\uff0c\u6211\u4eec\u7684\u665a\u9910\u65f6\u95f4\u8d85\u51fa\u4e86\u9884\u671f\u3002\u672c\u6765\u5b89\u6392\u4e8625\u5206\u949f\u7684\u665a\u9910\uff0c\u53ef\u4f60\u8fd9\u4e48\u53cb\u597d\uff0c\u665a\u5bb4\u6301\u7eed\u4e86 \u200b...\u5168\u6587<\/a>", 42 | "textLength": 1241, 43 | "source": "HUAWEI Mate 8", 44 | "favorited": false, 45 | "is_paid": false, 46 | "mblog_vip_type": 0, 47 | "user": { 48 | "id": 1248005561, 49 | "screen_name": "\u65b0\u56db\u541b\u5fae\u8584\u529b\u91cf", 50 | "profile_image_url": "https:\/\/tva1.sinaimg.cn\/crop.0.93.394.394.180\/4a630db9jw8ectj89qv12j20ay0g4ju7.jpg", 51 | "profile_url": "https:\/\/m.weibo.cn\/u\/1248005561?uid=1248005561&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 52 | "statuses_count": 49623, 53 | "verified": false, 54 | "verified_type": 220, 55 | "close_blue_v": false, 56 | "description": "\u6e29\u5ba4\u82b1\u6735\u53ea\u662f\u89c2\u8d4f\u690d\u7269\uff0c\u680b\u6881\u4e4b\u624d\u9700\u8981\u98ce\u5439\u96e8\u6c90\u3002\u73b0\u5b9e\u4e0d\u53ea\u6709\u7b11\u8fd8\u5e94\u6709\u54ed\uff0c\u7ecf\u5386\u548c\u632b\u6298\u662f\u5b9d\u8d35\u8d22\u5bcc\u3002", 57 | "gender": "m", 58 | "mbtype": 0, 59 | "urank": 40, 60 | "mbrank": 0, 61 | "follow_me": false, 62 | "following": false, 63 | "followers_count": 8031, 64 | "follow_count": 2230, 65 | "cover_image_phone": "https:\/\/tva3.sinaimg.cn\/crop.0.0.640.640.640\/6ce2240djw1e9odcin216j20hs0hstd8.jpg", 66 | "avatar_hd": "https:\/\/ww1.sinaimg.cn\/orj480\/4a630db9jw8ectj89qv12j20ay0g4ju7.jpg" 67 | }, 68 | "reposts_count": 0, 69 | "comments_count": 0, 70 | 
"attitudes_count": 0, 71 | "pending_approval_count": 0, 72 | "isLongText": true, 73 | "visible": { 74 | "type": 0, 75 | "list_id": 0 76 | }, 77 | "rid": "0_0_0_2676184252325337115", 78 | "mlevelSource": "monitor", 79 | "more_info_type": 0, 80 | "status": 0, 81 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-0|q:\u7279\u6717\u666e|ext:&mid=4172477577867420&", 82 | "page_info": { 83 | "page_pic": { 84 | "url": "https:\/\/ww4.sinaimg.cn\/large\/005BvWaMjw1eubrf1f94jj3050050mxb.jpg" 85 | }, 86 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=1001018008636068100000002&title=%2525E5%252595%252586%2525E5%25259C%252588&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 87 | "page_title": "\u8d35\u6eaa\u00b7\u9e70\u6f6d", 88 | "content1": "", 89 | "content2": "", 90 | "type": "webpage" 91 | }, 92 | "bid": "Fuptjx0FS" 93 | }, 94 | "scheme": "https:\/\/m.weibo.cn\/status\/Fuptjx0FS?mblogid=Fuptjx0FS&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 95 | }, 96 | { 97 | "card_type": 9, 98 | "card_type_name": "\u5fae\u535a", 99 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 100 | "actionlog": { 101 | "act_code": 554, 102 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 103 | "luicode": "", 104 | "uicode": "", 105 | "fid": "100103type=2&q=\u7279\u6717\u666e" 106 | }, 107 | "display_arrow": 0, 108 | "show_type": 1, 109 | "mblog": { 110 | "created_at": "1\u5206\u949f\u524d", 111 | "id": "4172477380932600", 112 | "mid": "4172477380932600", 113 | "idstr": "4172477380932600", 114 | "text": "#\u7279\u6717\u666e\u8bbf\u534e#<\/a>#\u7279\u6717\u666e\u6765\u4e86#<\/a>\u7279\u6717\u666e\u8fd9\u4e00\u6ce2\uff0c\u53cc\u65b9\u6e05\u7a7a\u4e862535\u4ebf\u7f8e\u5143\u7684\u8d2d\u7269\u8f66\u3002\u6295\u8d44\u7c7b\uff1a\u897f\u5f17\u5409\u5c3c\u4e9a\u9875\u5ca9\u6c14\u5f00\u53d1\uff1a837\u4ebf\u7f8e\u5143\u9655\u897f\u6986\u6797\u7164\u6db2\u5316\u9879\u76ee\uff1a117\u4ebf\u7f8e\u5143\u963f\u62c9\u65af\u52a0\u6db2\u5316\u5929\u7136\u6c14\u5f00\u53d1\uff1a430\u4ebf\u7f8e\u5143\u6d88\u8d39\u7c7b\uff1a\u8d2d\u4e70\u7f8e\u56fd\u6db2\u5316\u5929\u7136\u6c14\uff1a110\u4ebf\u7f8e\u5143\u8d2d\u4e70\u7f8e\u56fd\u4e59\u70f7\uff1a260\u4ebf\u7f8e\u5143\u8d2d\u4e70\u6ce2\u97f3\u98de\u673a\uff1a370\u4ebf\u7f8e\u5143\u8d2d\u4e70\u5ba2\u673a\u53d1\u52a8\u673a\uff1a \u200b...\u5168\u6587<\/a>", 115 | "textLength": 487, 116 | "source": "Weibo.intl", 117 | "favorited": false, 118 | "thumbnail_pic": "http:\/\/wx1.sinaimg.cn\/thumbnail\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 119 | "bmiddle_pic": "http:\/\/wx1.sinaimg.cn\/bmiddle\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 120 | "original_pic": "http:\/\/wx1.sinaimg.cn\/large\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 121 | "is_paid": false, 122 | "mblog_vip_type": 0, 123 | "user": { 124 | "id": 5323903225, 125 | "screen_name": "\u9752\u6850K", 126 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.0.0.480.480.180\/005OiyqRly8fk8xegluatj30dc0dcgnv.jpg", 127 | "profile_url": "https:\/\/m.weibo.cn\/u\/5323903225?uid=5323903225&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 128 | "statuses_count": 13, 129 | "verified": false, 130 | "verified_type": -1, 131 | "close_blue_v": false, 132 | "description": "We are all in the gutter , but some of us are looking at the stars.", 133 | "gender": "m", 134 | "mbtype": 2, 135 | "urank": 14, 136 | "mbrank": 1, 137 | 
"follow_me": false, 138 | "following": false, 139 | "followers_count": 109, 140 | "follow_count": 168, 141 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/6cf8d7ebjw1ehfr4xa8psj20hs0hsgpg.jpg", 142 | "avatar_hd": "https:\/\/wx3.sinaimg.cn\/orj480\/005OiyqRly8fk8xegluatj30dc0dcgnv.jpg" 143 | }, 144 | "reposts_count": 0, 145 | "comments_count": 0, 146 | "attitudes_count": 0, 147 | "pending_approval_count": 0, 148 | "isLongText": true, 149 | "visible": { 150 | "type": 0, 151 | "list_id": 0 152 | }, 153 | "rid": "1_0_0_2676184252325337115", 154 | "more_info_type": 0, 155 | "status": 0, 156 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-1|q:\u7279\u6717\u666e|ext:&mid=4172477380932600&", 157 | "page_info": { 158 | "page_pic": { 159 | "url": "https:\/\/wx2.sinaimg.cn\/thumbnail\/654b47daly1flawyuu3vgj2050050mxl.jpg" 160 | }, 161 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=100808e025355b5f4f2f6264fa66697f7c3139&extparam=%E7%89%B9%E6%9C%97%E6%99%AE%E8%AE%BF%E5%8D%8E&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 162 | "page_title": "#\u7279\u6717\u666e\u8bbf\u534e#", 163 | "content1": "11\u67088\u65e5\u81f310\u65e5\uff0c\u7b2c\u4e00\u65f6\u95f4\u64ad\u62a5", 164 | "content2": "462\u4eba\u5173\u6ce8", 165 | "type": "topic" 166 | }, 167 | "bid": "Fupt03UBW", 168 | "pics": [ 169 | { 170 | "pid": "005OiyqRly1flcripbkpjj30go0g3js6", 171 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 172 | "size": "orj360", 173 | "geo": { 174 | "width": 279, 175 | "height": 270, 176 | "croped": false 177 | }, 178 | "large": { 179 | "size": "large", 180 | "url": "https:\/\/wx1.sinaimg.cn\/large\/005OiyqRly1flcripbkpjj30go0g3js6.jpg", 181 | "geo": { 182 | "width": "600", 183 | "height": "579", 184 | "croped": false 185 | } 186 | } 187 | } 188 | ] 189 | }, 190 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupt03UBW?mblogid=Fupt03UBW&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 191 | }, 192 | { 193 | "card_type": 9, 194 | "card_type_name": "\u5fae\u535a", 195 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 196 | "actionlog": { 197 | "act_code": 554, 198 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 199 | "luicode": "", 200 | "uicode": "", 201 | "fid": "100103type=2&q=\u7279\u6717\u666e" 202 | }, 203 | "display_arrow": 0, 204 | "show_type": 1, 205 | "mblog": { 206 | "created_at": "5\u5206\u949f\u524d", 207 | "id": "4172476239861040", 208 | "mid": "4172476239861040", 209 | "idstr": "4172476239861040", 210 | "text": "#\u7279\u6717\u666e\u7ed3\u675f\u8bbf\u534e#<\/a>\u53bbAPEC\uff0c#\u4e2d\u7f8e2535\u4ebf\u5927\u5355#<\/a>\u5305\u62ec\u54ea\u4e9b\uff1f1\u3001\u4e2d\u822a\u6750\u4e0e\u6ce2\u97f3\u534f\u8bae370\u4ebf2\u3001\u901a\u7528\u7535\u6c1435\u4ebf3\u3001\u9ad8\u901a120\u4ebf4\u3001\u4e2d\u6838\u96c6\u56e2\u4e0e\u7f8e\u56fd\u897f\u5c4b\u7535\u6c14\u516c\u53f8\u5546\u8ba8\u5408\u4f5c\u4e8b\u5b9c5\u3001\u4e2d\u6295\u516c\u53f8\u4e0e\u9ad8\u76db\u96c6\u56e2\u5408\u4f5c\u57fa\u91d1\u91d1\u989d50\u4ebf6\u3001\u6295\u8d44\u963f\u62c9\u65af\u52a0\u5dde\u5f00\u53d1\u6db2\u5316\u5929\u7136\u6c14430\u4ebf7\u3001\u8fdb\u53e31200\u4e07\u5428\u5927\u8c4650\u4ebf8\u3001\u897f\u5f17\u5dde\u6295\u8d44837\u4ebf9\u3001UOP2.2\u4ebf10 \u200b...\u5168\u6587<\/a>", 211 | "textLength": 778, 212 | "source": "\u5fae\u535a weibo.com", 213 | "favorited": false, 214 | "thumbnail_pic": 
"http:\/\/wx1.sinaimg.cn\/thumbnail\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 215 | "bmiddle_pic": "http:\/\/wx1.sinaimg.cn\/bmiddle\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 216 | "original_pic": "http:\/\/wx1.sinaimg.cn\/large\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 217 | "is_paid": false, 218 | "mblog_vip_type": 0, 219 | "user": { 220 | "id": 2034511685, 221 | "screen_name": "\u6c88\u9633\u73af\u7403\u6559\u80b2_", 222 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.11.12.372.372.180\/79442f45ly8fjovanslgdj20ap0ap3zi.jpg", 223 | "profile_url": "https:\/\/m.weibo.cn\/u\/2034511685?uid=2034511685&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 224 | "statuses_count": 1705, 225 | "verified": true, 226 | "verified_type": 2, 227 | "verified_type_ext": 0, 228 | "verified_reason": "\u6c88\u9633\u73af\u7403\u96c5\u601d\u57f9\u8bad\u5b66\u6821", 229 | "close_blue_v": false, 230 | "description": "\u73af\u7403\u6559\u80b2\u662f\u5168\u56fd\u6700\u5927\u7684\u96c5\u601d\u9ad8\u5206\u57f9\u8bad\u57fa\u5730\uff0c\u6210\u4e3a\u5168\u7403\u7b2c\u4e00\u5bb6\u8de8\u56fd\u6559\u80b2\u4e0a\u5e02\u4f01\u4e1a\u3002\u6bcf\u6708\u5f00\u8bbeVIP\u79c1\u4eba\u5b9a\u5236\u73ed\u30013\u4eba\uff0c6\u4eba\uff0c10\u4eba\u73ed\u3002\u54a8\u8be2\u7535\u8bdd\uff1a400 900 3013", 231 | "gender": "m", 232 | "mbtype": 0, 233 | "urank": 20, 234 | "mbrank": 0, 235 | "follow_me": false, 236 | "following": false, 237 | "followers_count": 1388, 238 | "follow_count": 417, 239 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 240 | "avatar_hd": "https:\/\/wx1.sinaimg.cn\/orj480\/79442f45ly8fjovanslgdj20ap0ap3zi.jpg" 241 | }, 242 | "reposts_count": 0, 243 | "comments_count": 0, 244 | "attitudes_count": 0, 245 | "pending_approval_count": 0, 246 | "isLongText": true, 247 | "visible": { 248 | "type": 0, 249 | "list_id": 0 250 | }, 251 | "rid": "3_0_0_2676184252325337115", 252 | "more_info_type": 0, 253 | "status": 0, 254 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-2|q:\u7279\u6717\u666e|ext:&mid=4172476239861040&", 255 | "page_info": { 256 | "page_pic": { 257 | "url": "https:\/\/tva4.sinaimg.cn\/crop.0.0.2780.1566\/90eb2137ly1fl41rhctqsj225c17iwzv.jpg" 258 | }, 259 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=23137500007546610938550273&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 260 | "page_title": "\u3010\u65b0\u9c9c\u4e8b\u3011\u7279\u6717\u666e\u4e9a\u6d32\u884c", 261 | "content1": "", 262 | "content2": "", 263 | "type": "webpage" 264 | }, 265 | "bid": "Fupr9Fnj2", 266 | "pics": [ 267 | { 268 | "pid": "79442f45ly1flcrd3b7pxj20dw0afjrn", 269 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 270 | "size": "orj360", 271 | "geo": { 272 | "width": 360, 273 | "height": 270, 274 | "croped": false 275 | }, 276 | "large": { 277 | "size": "large", 278 | "url": "https:\/\/wx1.sinaimg.cn\/large\/79442f45ly1flcrd3b7pxj20dw0afjrn.jpg", 279 | "geo": { 280 | "width": "500", 281 | "height": "375", 282 | "croped": false 283 | } 284 | } 285 | } 286 | ] 287 | }, 288 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr9Fnj2?mblogid=Fupr9Fnj2&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 289 | }, 290 | { 291 | "card_type": 9, 292 | "card_type_name": "\u5fae\u535a", 293 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 294 | "actionlog": { 295 | 
"act_code": 554, 296 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 297 | "luicode": "", 298 | "uicode": "", 299 | "fid": "100103type=2&q=\u7279\u6717\u666e" 300 | }, 301 | "display_arrow": 0, 302 | "show_type": 1, 303 | "mblog": { 304 | "created_at": "5\u5206\u949f\u524d", 305 | "id": "4172476231452360", 306 | "mid": "4172476231452360", 307 | "idstr": "4172476231452360", 308 | "text": "\u5ddd\u666e \u7279\u6717\u666e \u90fd\u6210\u654f\u611f\u8bcd\u4e86\u3002 \u200b", 309 | "textLength": 26, 310 | "source": "\u5fae\u535a weibo.com", 311 | "favorited": false, 312 | "is_paid": false, 313 | "mblog_vip_type": 0, 314 | "user": { 315 | "id": 2299119915, 316 | "screen_name": "\u62e5\u62a4\u4f1f\u5927\u9886\u8896", 317 | "profile_image_url": "https:\/\/tvax3.sinaimg.cn\/crop.192.126.481.481.180\/8909c92bly8fjnskl84c9j20o20kejt4.jpg", 318 | "profile_url": "https:\/\/m.weibo.cn\/u\/2299119915?uid=2299119915&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 319 | "statuses_count": 431, 320 | "verified": false, 321 | "verified_type": -1, 322 | "close_blue_v": false, 323 | "description": "\u4eba\u95f4\u6b63\u9053\u662f\u6ca7\u6851\u3002", 324 | "gender": "m", 325 | "mbtype": 2, 326 | "urank": 9, 327 | "mbrank": 1, 328 | "follow_me": false, 329 | "following": false, 330 | "followers_count": 170, 331 | "follow_count": 352, 332 | "cover_image_phone": "https:\/\/tva3.sinaimg.cn\/crop.0.0.640.640.640\/68f96449tw1egwcah85a8j20hs0hsdic.jpg", 333 | "avatar_hd": "https:\/\/wx3.sinaimg.cn\/orj480\/8909c92bly8fjnskl84c9j20o20kejt4.jpg" 334 | }, 335 | "reposts_count": 0, 336 | "comments_count": 0, 337 | "attitudes_count": 0, 338 | "pending_approval_count": 0, 339 | "isLongText": false, 340 | "visible": { 341 | "type": 0, 342 | "list_id": 0 343 | }, 344 | "rid": "4_0_0_2676184252325337115", 345 | "more_info_type": 0, 346 | "status": 0, 347 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-3|q:\u7279\u6717\u666e|ext:&mid=4172476231452360&", 348 | "bid": "Fupr965Pa" 349 | }, 350 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr965Pa?mblogid=Fupr965Pa&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 351 | }, 352 | { 353 | "card_type": 9, 354 | "card_type_name": "\u5fae\u535a", 355 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 356 | "actionlog": { 357 | "act_code": 554, 358 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 359 | "luicode": "", 360 | "uicode": "", 361 | "fid": "100103type=2&q=\u7279\u6717\u666e" 362 | }, 363 | "display_arrow": 0, 364 | "show_type": 1, 365 | "mblog": { 366 | "created_at": "6\u5206\u949f\u524d", 367 | "id": "4172476159858375", 368 | "mid": "4172476159858375", 369 | "idstr": "4172476159858375", 370 | "text": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7 <\/span><\/i>\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7<\/a> \u200b", 371 | "textLength": 62, 372 | "source": "ZAKER\u624e\u5ba2Android\u7248", 373 | "favorited": false, 374 | "is_paid": false, 375 | "mblog_vip_type": 0, 376 | "user": { 377 | "id": 1709297804, 378 | "screen_name": "J-\u6a3e\u7498", 379 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.180.180.180\/65e1d08cjw1e8qgp5bmzyj2050050aa8.jpg", 380 | "profile_url": 
"https:\/\/m.weibo.cn\/u\/1709297804?uid=1709297804&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 381 | "statuses_count": 5235, 382 | "verified": false, 383 | "verified_type": 220, 384 | "close_blue_v": false, 385 | "description": "\u5b9e\u4e4b\u534e\u4e4b\u5179\u4e43\u517c\u6c42\uff0c\u987a\u98ce\u516e\u9006\u98ce\u516e\u65e0\u963b\u6211\u98de\u626c\u3002", 386 | "gender": "f", 387 | "mbtype": 0, 388 | "urank": 35, 389 | "mbrank": 0, 390 | "follow_me": false, 391 | "following": false, 392 | "followers_count": 1562, 393 | "follow_count": 374, 394 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 395 | "avatar_hd": "https:\/\/ww3.sinaimg.cn\/orj480\/65e1d08cjw1e8qgp5bmzyj2050050aa8.jpg" 396 | }, 397 | "reposts_count": 0, 398 | "comments_count": 0, 399 | "attitudes_count": 0, 400 | "pending_approval_count": 0, 401 | "isLongText": false, 402 | "visible": { 403 | "type": 0, 404 | "list_id": 0 405 | }, 406 | "rid": "5_0_0_2676184252325337115", 407 | "more_info_type": 0, 408 | "status": 0, 409 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-4|q:\u7279\u6717\u666e|ext:&mid=4172476159858375&", 410 | "page_info": { 411 | "page_pic": { 412 | "url": "http:\/\/zkres.myzaker.com\/data\/ads_web\/share_pic.png" 413 | }, 414 | "page_url": "http:\/\/weibo.cn\/sinaurl\/blocked295ca5d8?url=http%3A%2F%2Fiphone.myzaker.com%2Fl.php%3Fl%3D5a03e91d9490cbd97b000031&sinainternalbrowser=topnav&share_menu=1&url_type=39&object_type=webpage&pos=2&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320&u=http%3A%2F%2Fiphone.myzaker.com%2Fl.php%3Fl%3D5a03e91d9490cbd97b000031", 415 | "page_title": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7", 416 | "content1": "\u7279\u6717\u666e\uff1a\u672c\u676525\u5206\u949f\u7684\u665a\u9910\u6301\u7eed2\u5c0f\u65f6 \u975e\u5e38\u4eab\u53d7", 417 | "content2": "\u4eca\u5929(11\u67089\u65e5)\u4e0a\u5348\uff0c\u56fd\u5bb6\u4e3b\u5e2d\u4e60\u8fd1\u5e73\u4e0e\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u5728\u4eba\u6c11\u5927\u4f1a\u5802\u4e3e\u884c\u4f1a\u8c08\u3002\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\uff1a\u4e3b\u5e2d\u5148\u751f\uff0c\u975e\u5e38\u611f\u8c22\uff0c\u548c\u60a8\u5728\u4e00\u8d77\u5f88\u8363\u5e78\u3002\u4e2d\u7f8e\u5173\u7cfb\u662f\u975e\u5e38\u91cd\u8981\u7684\u8bdd\u9898\uff0c\u6d89\u53ca\u6211\u4eec\u53cc\u65b9\u4e5f\u5305\u62ec\u5176\u4ed6\u4e00\u4e9b\u56fd\u5bb6\u3002\u6211\u4eec\u76f8\u4fe1\uff0c\u4e2d\u7f8e\u6709\u80fd\u529b\u5728\u4eca\u540e\u89e3\u51b3\u4e16\u754c\u95ee\u9898\u3002\u6628\u5929\u665a\u4e0a\u7684\u4f1a\u6664\u662f\u975e\u5e38\u68d2\u7684\uff0c\u6211\u4eec\u7684\u665a\u9910\u65f6\u95f4\u8d85\u51fa\u4e86\u9884\u671f\u3002\u672c\u6765\u5b89\u6392\u4e8625\u5206\u949f\u7684\u665a\u9910\uff0c\u53ef\u4f60\u8fd9\u4e48\u53cb\u597d\uff0c\u665a\u5bb4\u6301\u7eed\u4e86\u81f3\u5c11\u4e24\u4e2a\u5c0f\u65f6\u3002\u548c\u60a8\u548c\u60a8\u7684\u592b\u4eba\u4e00\u8d77\uff0c\u6bcf\u4e00\u5206\u949f\u6211\u4eec\u90fd\u975e\u5e38\u4eab\u53d7\u3002\u6211\u4eec\u7684\u5173\u7cfb\u662f...", 418 | "type": "webpage" 419 | }, 420 | "bid": "Fupr1FmC3" 421 | }, 422 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupr1FmC3?mblogid=Fupr1FmC3&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 423 | }, 424 | { 425 | "card_type": 9, 426 | "card_type_name": "\u5fae\u535a", 427 | "itemid": 
"seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 428 | "actionlog": { 429 | "act_code": 554, 430 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 431 | "luicode": "", 432 | "uicode": "", 433 | "fid": "100103type=2&q=\u7279\u6717\u666e" 434 | }, 435 | "display_arrow": 0, 436 | "show_type": 1, 437 | "mblog": { 438 | "created_at": "6\u5206\u949f\u524d", 439 | "id": "4172476063590165", 440 | "mid": "4172476063590165", 441 | "idstr": "4172476063590165", 442 | "text": "\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e - \u6211\u6b63\u5728\u770b\u4e13\u9898\uff1a\u300a\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e \u5df2\u62b5\u8fbe\u9996\u90fd\u673a\u573a\u300b \u7f8e\u56fd\u8fd8\u6b20\u4e2d\u56fd\u7684\u503a\u52a16\u5343\u591a\u4ebf\u5143\uff0c\u4e3a\u4f55\u8fd8\u8981\u548c\u5b83\u7b7e\u8ba22\u5343\u591a\u4ebf\u9879\u76ee\u5408\u540c\uff0c\u4e0d\u6015\u7f8e\u56fd\u4eba\u8fd8\u4e0d\u8d77\u5417 <\/span><\/i>\u7279\u6717\u666e\u9996\u6b21\u8bbf\u534e<\/a> \u200b", 443 | "textLength": 166, 444 | "source": "bShare\u5206\u4eab", 445 | "favorited": false, 446 | "is_paid": false, 447 | "mblog_vip_type": 0, 448 | "user": { 449 | "id": 5602219262, 450 | "screen_name": "\u5c0f\u8bf4\u5bb675152", 451 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.1.45.636.636.180\/00678l9cgw1f0phvt4fsjj30hs0np0w1.jpg", 452 | "profile_url": "https:\/\/m.weibo.cn\/u\/5602219262?uid=5602219262&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 453 | "statuses_count": 11882, 454 | "verified": false, 455 | "verified_type": -1, 456 | "close_blue_v": false, 457 | "description": "\u4e00\u4e2a\u4e0d\u7518\u5bc2\u5bde\u662f\u4f60\u4e0d\u611f\u5174\u8da3\u800c\u52aa\u529b\u8ffd\u5bfb\u81ea\u5df1\u68a6\u5883\u7684\u4eba", 458 | "gender": "m", 459 | "mbtype": 0, 460 | "urank": 32, 461 | "mbrank": 0, 462 | "follow_me": false, 463 | "following": false, 464 | "followers_count": 1156, 465 | "follow_count": 1998, 466 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 467 | "avatar_hd": "https:\/\/ww2.sinaimg.cn\/orj480\/00678l9cgw1f0phvt4fsjj30hs0np0w1.jpg" 468 | }, 469 | "reposts_count": 0, 470 | "comments_count": 0, 471 | "attitudes_count": 0, 472 | "pending_approval_count": 0, 473 | "isLongText": false, 474 | "visible": { 475 | "type": 0, 476 | "list_id": 0 477 | }, 478 | "rid": "6_0_0_2676184252325337115", 479 | "more_info_type": 0, 480 | "status": 0, 481 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-5|q:\u7279\u6717\u666e|ext:&mid=4172476063590165&", 482 | "page_info": { 483 | "page_pic": { 484 | "url": "https:\/\/tva4.sinaimg.cn\/crop.0.0.2780.1566\/90eb2137ly1fl41rhctqsj225c17iwzv.jpg" 485 | }, 486 | "page_url": "https:\/\/m.weibo.cn\/p\/index?containerid=23137500007546610938550273&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 487 | "page_title": "\u3010\u65b0\u9c9c\u4e8b\u3011\u7279\u6717\u666e\u4e9a\u6d32\u884c", 488 | "content1": "", 489 | "content2": "", 490 | "type": "webpage" 491 | }, 492 | "bid": "FupqSf3XT" 493 | }, 494 | "scheme": "https:\/\/m.weibo.cn\/status\/FupqSf3XT?mblogid=FupqSf3XT&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 495 | }, 496 | { 497 | "card_type": 9, 498 | "card_type_name": "\u5fae\u535a", 499 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 500 | "actionlog": { 501 | 
"act_code": 554, 502 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 503 | "luicode": "", 504 | "uicode": "", 505 | "fid": "100103type=2&q=\u7279\u6717\u666e" 506 | }, 507 | "display_arrow": 0, 508 | "show_type": 1, 509 | "mblog": { 510 | "created_at": "6\u5206\u949f\u524d", 511 | "id": "4172476042870320", 512 | "mid": "4172476042870320", 513 | "idstr": "4172476042870320", 514 | "text": "\u3010\u65e9\u5b89\u301111\u670810\u65e5\u65f6\u653f\u70ed\u70b9\u77e5\u8bc6\u79ef\u7d2f\uff1a<\/span><\/i>\u7f51\u9875\u94fe\u63a5<\/a>\u3000\"[\u5403\u74dc]\"<\/span>\u8981\u95fb\uff1a9\u65e5\uff0c\u4e60\u8fd1\u5e73\u5728\u4eba\u6c11\u5927\u4f1a\u5802\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u4e3e\u884c\u4f1a\u8c08\u3002\u4f1a\u8c08\u4e2d\uff0c\u4e24\u56fd\u5143\u9996\u5c31\u52a0\u5f3a\u4e2d\u7f8e\u53cc\u8fb9\u3001\u5730\u533a\u548c\u5168\u7403\u5c42\u9762\u5408\u4f5c\u8fbe\u6210\u591a\u9879\u91cd\u8981\u6210\u679c\u548c\u5171\u8bc6\u3002 \u200b", 515 | "textLength": 190, 516 | "source": "\u5fae\u535a weibo.com", 517 | "favorited": false, 518 | "thumbnail_pic": "http:\/\/wx2.sinaimg.cn\/thumbnail\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 519 | "bmiddle_pic": "http:\/\/wx2.sinaimg.cn\/bmiddle\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 520 | "original_pic": "http:\/\/wx2.sinaimg.cn\/large\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 521 | "is_paid": false, 522 | "mblog_vip_type": 0, 523 | "user": { 524 | "id": 3859325228, 525 | "screen_name": "\u54c8\u5c14\u6ee8\u534e\u56fe", 526 | "profile_image_url": "https:\/\/tva2.sinaimg.cn\/crop.0.0.180.180.180\/e608a12cjw8eswds55qnoj20500500sr.jpg", 527 | "profile_url": "https:\/\/m.weibo.cn\/u\/3859325228?uid=3859325228&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 528 | "statuses_count": 2360, 529 | "verified": true, 530 | "verified_type": 2, 531 | "verified_type_ext": 0, 532 | "verified_reason": "\u5317\u4eac\u534e\u56fe\u5b8f\u9633\u6559\u80b2\u6587\u5316\u53d1\u5c55\u80a1\u4efd\u6709\u9650\u516c\u53f8\u54c8\u5c14\u6ee8\u5206\u516c\u53f8", 533 | "close_blue_v": false, 534 | "description": "\u9ed1\u9f99\u6c5f\u534e\u56fe\u6559\u80b2", 535 | "gender": "m", 536 | "mbtype": 0, 537 | "urank": 14, 538 | "mbrank": 0, 539 | "follow_me": false, 540 | "following": false, 541 | "followers_count": 12058, 542 | "follow_count": 181, 543 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 544 | "avatar_hd": "https:\/\/ww2.sinaimg.cn\/orj480\/e608a12cjw8eswds55qnoj20500500sr.jpg" 545 | }, 546 | "reposts_count": 0, 547 | "comments_count": 0, 548 | "attitudes_count": 0, 549 | "pending_approval_count": 0, 550 | "isLongText": false, 551 | "visible": { 552 | "type": 0, 553 | "list_id": 0 554 | }, 555 | "rid": "7_0_0_2676184252325337115", 556 | "mlevelSource": "monitor", 557 | "more_info_type": 0, 558 | "status": 0, 559 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-6|q:\u7279\u6717\u666e|ext:&mid=4172476042870320&", 560 | "bid": "FupqQc2Hu", 561 | "pics": [ 562 | { 563 | "pid": "e608a12cly1flcrd5fh2qj20j60y3jtk", 564 | "url": "https:\/\/wx2.sinaimg.cn\/orj360\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 565 | "size": "orj360", 566 | "geo": { 567 | "width": 360, 568 | "height": 640, 569 | "croped": false 570 | }, 571 | "large": { 572 | "size": "large", 573 | "url": "https:\/\/wx2.sinaimg.cn\/large\/e608a12cly1flcrd5fh2qj20j60y3jtk.jpg", 574 | "geo": { 575 | "width": "690", 576 | "height": "1227", 577 | "croped": false 578 | } 579 | } 
580 | } 581 | ] 582 | }, 583 | "scheme": "https:\/\/m.weibo.cn\/status\/FupqQc2Hu?mblogid=FupqQc2Hu&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 584 | }, 585 | { 586 | "card_type": 9, 587 | "card_type_name": "\u5fae\u535a", 588 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 589 | "actionlog": { 590 | "act_code": 554, 591 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 592 | "luicode": "", 593 | "uicode": "", 594 | "fid": "100103type=2&q=\u7279\u6717\u666e" 595 | }, 596 | "display_arrow": 0, 597 | "show_type": 1, 598 | "mblog": { 599 | "created_at": "7\u5206\u949f\u524d", 600 | "id": "4172475753600035", 601 | "mid": "4172475753600035", 602 | "idstr": "4172475753600035", 603 | "text": "\u7279\u6717\u666e10\u65e5\u7ed3\u675f\u5bf9\u4e2d\u56fd\u7684\u56fd\u4e8b\u8bbf\u95ee \u4e58\u4e13\u673a\u79bb\u5f00\u5317\u4eac_\u7f51\u6613\u65b0\u95fb \uff08\u5206\u4eab\u81ea @\u7f51\u6613\u65b0\u95fb<\/a>\uff09 #sns_weibo#<\/a>sns_weibo&ep=Fupqnf6x5%2C2978480200%2CFupqnf6x5%2C2978480200\" class=\"\"><\/span><\/i>\u7279\u6717\u666e10\u65e5\u7ed3\u675f\u5bf9\u4e2d\u56fd\u7684\u56fd\u4e8b\u8bbf\u95ee \u4e58\u4e13\u673a\u79bb\u5f00\u5317\u4eac_\u7f51\u6613\u65b0\u95fb<\/a> \u200b", 604 | "textLength": 95, 605 | "source": "\u7f51\u6613\u65b0\u95fb", 606 | "favorited": false, 607 | "is_paid": false, 608 | "mblog_vip_type": 0, 609 | "user": { 610 | "id": 2978480200, 611 | "screen_name": "\u66fe\u601d\u6e90555", 612 | "profile_image_url": "https:\/\/tvax1.sinaimg.cn\/crop.14.0.721.721.180\/b1880048ly8fgjnua4z1gj20ku0k1gn4.jpg", 613 | "profile_url": "https:\/\/m.weibo.cn\/u\/2978480200?uid=2978480200&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 614 | "statuses_count": 15064, 615 | "verified": false, 616 | "verified_type": 220, 617 | "close_blue_v": false, 618 | "description": "\u996e\u6c34\u601d\u6e90\uff0c\u5c0a\u656c\u81f3\u4e0a\uff01", 619 | "gender": "m", 620 | "mbtype": 0, 621 | "urank": 39, 622 | "mbrank": 0, 623 | "follow_me": false, 624 | "following": false, 625 | "followers_count": 2278, 626 | "follow_count": 1205, 627 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 628 | "avatar_hd": "https:\/\/wx1.sinaimg.cn\/orj480\/b1880048ly8fgjnua4z1gj20ku0k1gn4.jpg" 629 | }, 630 | "reposts_count": 0, 631 | "comments_count": 0, 632 | "attitudes_count": 0, 633 | "pending_approval_count": 0, 634 | "isLongText": false, 635 | "visible": { 636 | "type": 0, 637 | "list_id": 0 638 | }, 639 | "rid": "8_0_0_2676184252325337115", 640 | "more_info_type": 0, 641 | "status": 0, 642 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-7|q:\u7279\u6717\u666e|ext:&mid=4172475753600035&", 643 | "bid": "Fupqnf6x5" 644 | }, 645 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupqnf6x5?mblogid=Fupqnf6x5&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 646 | }, 647 | { 648 | "card_type": 9, 649 | "card_type_name": "\u5fae\u535a", 650 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 651 | "actionlog": { 652 | "act_code": 554, 653 | "ext": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 654 | "luicode": "", 655 | "uicode": "", 656 | "fid": "100103type=2&q=\u7279\u6717\u666e" 657 | }, 658 | "display_arrow": 0, 659 | "show_type": 1, 660 | "mblog": { 661 | 
"created_at": "7\u5206\u949f\u524d", 662 | "id": "4172475753152500", 663 | "mid": "4172475753152500", 664 | "idstr": "4172475753152500", 665 | "text": "#\u8fd1\u65e5\u65f6\u4e8b#<\/a>2017\u5e7411\u67088\u65e5\uff0c\u7f8e\u56fd\u56fd\u5bb6\u603b\u7edf\u7279\u6717\u666e\u53ca\u5176\u592b\u4eba\u4e00\u884c\u4eba\u62b5\u8fbe\u5317\u4eac\uff0c\u8fdb\u884c\u56fd\u4e8b\u8bbf\u95ee\u3002\u8fd9\u6b21\u7279\u6717\u666e\u8bbf\u534e\uff0c\u6709\u7740\u4e09\u4e2a\u7b2c\u4e00\u6b21\uff1a\u8fd9\u662f\u7279\u6717\u666e\u4f5c\u4e3a\u7f8e\u5229\u575a\u5408\u4f17\u56fd\u603b\u7edf\uff0c\u7b2c\u4e00\u6b21\u6765\u4e2d\u56fd\u8bbf\u95ee\u3002\u8fd9\u5e94\u8be5\u4e5f\u662f\u4ed6\u6765\u5230\u4eba\u4e1671\u5e74\u6765\uff0c\u7b2c\u4e00\u6b21\u6765\u4e2d\u56fd\u3002\u8fd9\u4e5f\u662f\u4e2d\u56fd\u5386\u53f2\u6027\u5927\u4f1a\u540e\uff0c\u7b2c\u4e00\u4e2a\u6765\u8bbf\u7684\u5916\u56fd\u56fd\u5bb6\u5143\u9996\u3002\u4e2d\u65b9\u6b64\u524d\u5c31\u8868\u793a\uff0c\u4e2d\u56fd\u5c06\u4ee5 \u200b...\u5168\u6587<\/a>", 666 | "textLength": 357, 667 | "source": "\u5fae\u535a weibo.com", 668 | "favorited": false, 669 | "thumbnail_pic": "http:\/\/wx3.sinaimg.cn\/thumbnail\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 670 | "bmiddle_pic": "http:\/\/wx3.sinaimg.cn\/bmiddle\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 671 | "original_pic": "http:\/\/wx3.sinaimg.cn\/large\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 672 | "is_paid": false, 673 | "mblog_vip_type": 0, 674 | "user": { 675 | "id": 1877697625, 676 | "screen_name": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u5e7f\u64ad\u53f0", 677 | "profile_image_url": "https:\/\/tva3.sinaimg.cn\/crop.0.0.180.180.180\/6feb6459jw1e8qgp5bmzyj2050050aa8.jpg", 678 | "profile_url": "https:\/\/m.weibo.cn\/u\/1877697625?uid=1877697625&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320", 679 | "statuses_count": 4833, 680 | "verified": true, 681 | "verified_type": 4, 682 | "verified_type_ext": 0, 683 | "verified_reason": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u6821\u56ed\u5e7f\u64ad\u53f0\u5b98\u65b9\u5fae\u535a", 684 | "close_blue_v": false, 685 | "description": "\u5317\u4eac\u7269\u8d44\u5b66\u9662\u5e7f\u64ad\u53f0\u96b6\u5c5e\u4e8e\u515a\u59d4\u5ba3\u4f20\u90e8\u5927\u5b66\u751f\u8bb0\u8005\u56e2\u3002\u5982\u679c\u4f60\u8ba4\u4e3a\u5e7f\u64ad\u53f0\u53ea\u6709\u64ad\u97f3\u3001\u7f16\u8f91\uff0c\u90a3\u4f60\u5c31OUT\u5566\u3002\u7269\u9662\u4e4b\u58f0\u5e7f\u64ad\u53f0\u76ee\u524d\u6709\u516d\u90e8\uff08\u529e\u516c\u5ba4\u3001\u64ad\u97f3\u90e8\u3001\u7f16\u8f91\u90e8\u3001\u5ba3\u4f20\u90e8\u3001\u520a\u7269\u90e8\u3001\u5916\u8054\u90e8\uff09\u4e00\u7ec4\uff08\u82f1\u6587\u7ec4\uff09\u3002", 686 | "gender": "f", 687 | "mbtype": 0, 688 | "urank": 32, 689 | "mbrank": 0, 690 | "follow_me": false, 691 | "following": false, 692 | "followers_count": 2514, 693 | "follow_count": 559, 694 | "cover_image_phone": "https:\/\/tva1.sinaimg.cn\/crop.0.0.640.640.640\/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg", 695 | "avatar_hd": "https:\/\/ww3.sinaimg.cn\/orj480\/6feb6459jw1e8qgp5bmzyj2050050aa8.jpg" 696 | }, 697 | "reposts_count": 0, 698 | "comments_count": 0, 699 | "attitudes_count": 0, 700 | "pending_approval_count": 0, 701 | "isLongText": true, 702 | "visible": { 703 | "type": 0, 704 | "list_id": 0 705 | }, 706 | "rid": "9_0_0_2676184252325337115", 707 | "more_info_type": 0, 708 | "status": 0, 709 | "itemid": "seqid:1156122784|type:2|t:|pos:1-0-8|q:\u7279\u6717\u666e|ext:&mid=4172475753152500&", 710 | "bid": "Fupqnde6M", 711 | "pics": [ 712 | { 713 | "pid": 
"6feb6459ly1flc60v7f3fj20hi0bsjvl", 714 | "url": "https:\/\/wx3.sinaimg.cn\/orj360\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 715 | "size": "orj360", 716 | "geo": { 717 | "width": 401, 718 | "height": 270, 719 | "croped": false 720 | }, 721 | "large": { 722 | "size": "large", 723 | "url": "https:\/\/wx3.sinaimg.cn\/large\/6feb6459ly1flc60v7f3fj20hi0bsjvl.jpg", 724 | "geo": { 725 | "width": "630", 726 | "height": "424", 727 | "croped": false 728 | } 729 | } 730 | }, 731 | { 732 | "pid": "6feb6459ly1flc61s7ptqj20b70admyr", 733 | "url": "https:\/\/wx1.sinaimg.cn\/orj360\/6feb6459ly1flc61s7ptqj20b70admyr.jpg", 734 | "size": "orj360", 735 | "geo": { 736 | "width": 291, 737 | "height": 270, 738 | "croped": false 739 | }, 740 | "large": { 741 | "size": "large", 742 | "url": "https:\/\/wx1.sinaimg.cn\/large\/6feb6459ly1flc61s7ptqj20b70admyr.jpg", 743 | "geo": { 744 | "width": "403", 745 | "height": "373", 746 | "croped": false 747 | } 748 | } 749 | } 750 | ] 751 | }, 752 | "scheme": "https:\/\/m.weibo.cn\/status\/Fupqnde6M?mblogid=Fupqnde6M&luicode=10000011&lfid=100103type%3D2%26q%3D%E7%89%B9%E6%9C%97%E6%99%AE&featurecode=20000320" 753 | } 754 | ] 755 | } 756 | ], 757 | "ok": 1, 758 | "showAppTips": 0, 759 | "scheme": "sinaweibo:\/\/cardlist?containerid=100103type=2&q=\u7279\u6717\u666e&extparam=&luicode=10000011&lfid=106003type=1&featurecode=20000320" 760 | } 761 | --------------------------------------------------------------------------------