├── .gitignore ├── README.md ├── scrapy.cfg └── zhihuuser ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py └── zhihu.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | *.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zhihu 2 | Zhihu User Spider 3 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Germey/Zhihu/b0dbb7389d1292114f74c24ad971b9250de4dc69/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | # define the fields for your item here like: 13 | id = Field() 14 | name = Field() 15 | avatar_url = Field() 16 | headline = Field() 17 | description = Field() 18 | url = Field() 19 | url_token = Field() 20 | gender = Field() 21 | cover_url = Field() 22 | type = Field() 23 | badge = Field() 24 | 25 | answer_count = Field() 26 | articles_count = Field() 27 | commercial_question_count = Field() 28 | favorite_count = Field() 29 | favorited_count = Field() 30 | follower_count = Field() 31 | following_columns_count = Field() 32 | following_count = Field() 33 | pins_count = Field() 34 | question_count = Field() 35 | thank_from_count = Field() 36 | thank_to_count = Field() 37 | thanked_count = Field() 38 | vote_from_count = Field() 39 | vote_to_count = Field() 40 | voteup_count = Field() 41 | following_favlists_count = Field() 42 | following_question_count = Field() 43 | following_topic_count = Field() 44 | marked_answers_count = Field() 45 | mutual_followees_count = Field() 46 | hosted_live_count = Field() 47 | participated_live_count = Field() 48 | 49 | locations = Field() 50 | educations = Field() 51 | employments = Field() 52 | 53 | -------------------------------------------------------------------------------- /zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class ZhihuPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | collection_name = 'users' 17 | 18 | def __init__(self, mongo_uri, mongo_db): 19 | self.mongo_uri = mongo_uri 20 | self.mongo_db = mongo_db 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | return cls( 25 | mongo_uri=crawler.settings.get('MONGO_URI'), 26 | mongo_db=crawler.settings.get('MONGO_DATABASE') 27 | ) 28 | 29 | def open_spider(self, spider): 30 | self.client = pymongo.MongoClient(self.mongo_uri) 31 | self.db = self.client[self.mongo_db] 32 | 33 | def close_spider(self, spider): 34 | self.client.close() 35 | 36 | def process_item(self, item, spider): 37 | self.db[self.collection_name].update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)  # upsert keyed on url_token 38 | return item 39 | -------------------------------------------------------------------------------- /zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'zhihu (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 43 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20', 44 | } 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'zhihuuser.middlewares.ZhihuSpiderMiddleware': 543, 50 | # } 51 | 52 | # SPIDER_MIDDLEWARES = { 53 | # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 54 | # } 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | # DOWNLOADER_MIDDLEWARES = { 59 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 60 | # } 61 | 62 | # DOWNLOADER_MIDDLEWARES = { 63 | # 'scrapy_splash.SplashCookiesMiddleware': 723, 64 | # 'scrapy_splash.SplashMiddleware': 725, 65 | # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 66 | # } 67 | 68 | # Enable or disable extensions 69 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 70 | # EXTENSIONS = { 71 | # 'scrapy.extensions.telnet.TelnetConsole': None, 72 | # } 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | ITEM_PIPELINES = { 77 | 'zhihuuser.pipelines.MongoPipeline': 300, 78 | # 'scrapy_redis.pipelines.RedisPipeline': 301 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | # AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | # AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | # AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | # AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | # HTTPCACHE_ENABLED = True 97 | # HTTPCACHE_EXPIRATION_SECS = 0 98 | # HTTPCACHE_DIR = 'httpcache' 99 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | 102 | # DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 103 | # HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 104 | 105 | # SPLASH_URL = 'http://192.168.99.100:8050' 106 | 107 | MONGO_URI = 'localhost' 108 | MONGO_DATABASE = 'zhihu' 109 | 110 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 111 | 112 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 113 | 114 | # SCHEDULER_FLUSH_ON_START = True 115 | -------------------------------------------------------------------------------- /zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | from scrapy import Spider, Request 5 | from zhihuuser.items import UserItem 6 | 7 | 8 | class ZhihuSpider(Spider): 9 | name = "zhihu" 10 | allowed_domains = ["www.zhihu.com"] 11 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 12 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 13 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}' 14 | start_user = 'excited-vczh' 15 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics' 16 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 17 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 18 | 19 | def start_requests(self): 20 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user) 21 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0), 22 | self.parse_follows) 23 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0), 24 | self.parse_followers) 25 | 26 | def parse_user(self, response): 27 | result = json.loads(response.text) 28 | item = 
UserItem() 29 | 30 | for field in item.fields: 31 | if field in result: 32 | item[field] = result.get(field) 33 | yield item 34 | 35 | yield Request( 36 | self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0), 37 | self.parse_follows) 38 | 39 | yield Request( 40 | self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0), 41 | self.parse_followers) 42 | 43 | def parse_follows(self, response): 44 | results = json.loads(response.text) 45 | 46 | if 'data' in results: 47 | for result in results.get('data'): 48 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 49 | self.parse_user) 50 | 51 | if 'paging' in results and results.get('paging').get('is_end') is False: 52 | next_page = results.get('paging').get('next') 53 | yield Request(next_page, 54 | self.parse_follows) 55 | 56 | def parse_followers(self, response): 57 | results = json.loads(response.text) 58 | 59 | if 'data' in results: 60 | for result in results.get('data'): 61 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 62 | self.parse_user) 63 | 64 | if 'paging' in results and results.get('paging').get('is_end') is False: 65 | next_page = results.get('paging').get('next') 66 | yield Request(next_page, 67 | self.parse_followers) 68 | --------------------------------------------------------------------------------
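Usage note (not part of the repository files above): a minimal sketch of how one might run the spider and then check what MongoPipeline has stored, assuming a local MongoDB instance matching the MONGO_URI and MONGO_DATABASE values in settings.py. The script name check_users.py and the printed fields are illustrative assumptions, not something the original project ships.

# check_users.py -- illustrative helper, assumes `scrapy crawl zhihu` has already been run
# and that MongoDB is reachable at the URI configured in zhihuuser/settings.py
import pymongo

MONGO_URI = 'localhost'      # mirrors MONGO_URI in settings.py
MONGO_DATABASE = 'zhihu'     # mirrors MONGO_DATABASE in settings.py

client = pymongo.MongoClient(MONGO_URI)
users = client[MONGO_DATABASE]['users']  # 'users' is MongoPipeline.collection_name

# Count stored profiles and print a small sample keyed by url_token
print('stored users:', users.count_documents({}))
for user in users.find({}, {'url_token': 1, 'name': 1, 'follower_count': 1}).limit(5):
    print(user.get('url_token'), user.get('name'), user.get('follower_count'))

Because MongoPipeline upserts on url_token, re-running the crawl refreshes existing documents rather than inserting duplicates, so the count above grows only as new users are discovered.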