├── .gitignore ├── README.md ├── scrapy.cfg └── zhihuuser ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py └── zhihu.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | *.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zhihu 2 | Zhihu User Spider 3 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Germey/Zhihu/b0dbb7389d1292114f74c24ad971b9250de4dc69/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | # define the fields for your item here like: 13 | id = Field() 14 | name = Field() 15 | avatar_url = Field() 16 | headline = Field() 17 | description = Field() 18 | url = Field() 19 | url_token = Field() 20 | gender = Field() 21 | cover_url = Field() 22 | type = Field() 23 | badge = Field() 24 | 25 | answer_count = Field() 26 | articles_count = Field() 27 | commercial_question_count = Field() 28 | favorite_count = Field() 29 | favorited_count = Field() 30 | follower_count = Field() 31 | following_columns_count = Field() 32 | following_count = Field() 33 | pins_count = Field() 34 | question_count = Field() 35 | thank_from_count = Field() 36 | thank_to_count = Field() 37 | thanked_count = Field() 38 | vote_from_count = Field() 39 | vote_to_count = Field() 40 | voteup_count = Field() 41 | following_favlists_count = Field() 42 | following_question_count = Field() 43 | following_topic_count = Field() 44 | marked_answers_count = Field() 45 | mutual_followees_count = Field() 46 | hosted_live_count = Field() 47 | participated_live_count = Field() 48 | 49 | locations = Field() 50 | educations = Field() 51 | employments = Field() 52 | 53 | -------------------------------------------------------------------------------- /zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class ZhihuPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | collection_name = 'users' 17 | 18 | def __init__(self, mongo_uri, mongo_db): 19 | self.mongo_uri = mongo_uri 20 | self.mongo_db = mongo_db 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | return cls( 25 | mongo_uri=crawler.settings.get('MONGO_URI'), 26 | mongo_db=crawler.settings.get('MONGO_DATABASE') 27 | ) 28 | 29 | def open_spider(self, spider): 30 | self.client = pymongo.MongoClient(self.mongo_uri) 31 | self.db = self.client[self.mongo_db] 32 | 33 | def close_spider(self, spider): 34 | self.client.close() 35 | 36 | def process_item(self, item, spider): 37 | self.db[self.collection_name].update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)  # upsert keyed on url_token 38 | return item 39 | -------------------------------------------------------------------------------- /zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'zhihu (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 43 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20', 44 | } 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'zhihuuser.middlewares.ZhihuSpiderMiddleware': 543, 50 | # } 51 | 52 | # SPIDER_MIDDLEWARES = { 53 | # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 54 | # } 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | # DOWNLOADER_MIDDLEWARES = { 59 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 60 | # } 61 | 62 | # DOWNLOADER_MIDDLEWARES = { 63 | # 'scrapy_splash.SplashCookiesMiddleware': 723, 64 | # 'scrapy_splash.SplashMiddleware': 725, 65 | # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 66 | # } 67 | 68 | # Enable or disable extensions 69 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 70 | # EXTENSIONS = { 71 | # 'scrapy.extensions.telnet.TelnetConsole': None, 72 | # } 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | ITEM_PIPELINES = { 77 | 'zhihuuser.pipelines.MongoPipeline': 300, 78 | # 'scrapy_redis.pipelines.RedisPipeline': 301 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | # AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | # AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | # AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | # AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | # HTTPCACHE_ENABLED = True 97 | # HTTPCACHE_EXPIRATION_SECS = 0 98 | # HTTPCACHE_DIR = 'httpcache' 99 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | 102 | # DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 103 | # HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 104 | 105 | # SPLASH_URL = 'http://192.168.99.100:8050' 106 | 107 | MONGO_URI = 'localhost' 108 | MONGO_DATABASE = 'zhihu' 109 | 110 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 111 | 112 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 113 | 114 | # SCHEDULER_FLUSH_ON_START = True 115 | -------------------------------------------------------------------------------- /zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | from scrapy import Spider, Request 5 | from zhihuuser.items import UserItem 6 | 7 | 8 | class ZhihuSpider(Spider): 9 | name = "zhihu" 10 | allowed_domains = ["www.zhihu.com"] 11 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 12 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 13 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}' 14 | start_user = 'excited-vczh' 15 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics' 16 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 17 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 18 | 19 | def start_requests(self): 20 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user) 21 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0), 22 | self.parse_follows) 23 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0), 24 | self.parse_followers) 25 | 26 | def parse_user(self, response): 27 | result = json.loads(response.text) 28 | item = 
UserItem() 29 | 30 | for field in item.fields: 31 | if field in result: 32 | item[field] = result.get(field) 33 | yield item 34 | 35 | yield Request( 36 | self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0), 37 | self.parse_follows) 38 | 39 | yield Request( 40 | self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0), 41 | self.parse_followers) 42 | 43 | def parse_follows(self, response): 44 | results = json.loads(response.text) 45 | 46 | if 'data' in results: 47 | for result in results.get('data'): 48 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 49 | self.parse_user) 50 | 51 | if 'paging' in results and results.get('paging').get('is_end') is False: 52 | next_page = results.get('paging').get('next') 53 | yield Request(next_page, 54 | self.parse_follows) 55 | 56 | def parse_followers(self, response): 57 | results = json.loads(response.text) 58 | 59 | if 'data' in results: 60 | for result in results.get('data'): 61 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 62 | self.parse_user) 63 | 64 | if 'paging' in results and results.get('paging').get('is_end') is False: 65 | next_page = results.get('paging').get('next') 66 | yield Request(next_page, 67 | self.parse_followers) 68 | --------------------------------------------------------------------------------
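Usage note (not part of the repository files above): a minimal sketch of how one might run the spider and then check what MongoPipeline has stored, assuming a local MongoDB instance matching the MONGO_URI and MONGO_DATABASE values in settings.py. The script name check_users.py and the printed fields are illustrative assumptions, not something the original project ships.

# check_users.py -- illustrative helper, assumes `scrapy crawl zhihu` has already been run
# and that MongoDB is reachable at the URI configured in zhihuuser/settings.py
import pymongo

MONGO_URI = 'localhost'      # mirrors MONGO_URI in settings.py
MONGO_DATABASE = 'zhihu'     # mirrors MONGO_DATABASE in settings.py

client = pymongo.MongoClient(MONGO_URI)
users = client[MONGO_DATABASE]['users']  # 'users' is MongoPipeline.collection_name

# Count stored profiles and print a small sample keyed by url_token
print('stored users:', users.count_documents({}))
for user in users.find({}, {'url_token': 1, 'name': 1, 'follower_count': 1}).limit(5):
    print(user.get('url_token'), user.get('name'), user.get('follower_count'))

Because MongoPipeline upserts on url_token, re-running the crawl refreshes existing documents rather than inserting duplicates, so the count above grows only as new users are discovered.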