├── README.md
├── zhihuuser
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-35.pyc
│   │   ├── __init__.cpython-35.pyc
│   │   ├── pipelines.cpython-35.pyc
│   │   └── settings.cpython-35.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── zhihu.cpython-35.pyc
│   │   │   └── __init__.cpython-35.pyc
│   │   ├── __init__.py
│   │   └── zhihu.py
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── zhihuuser.iml
│   │   ├── misc.xml
│   │   └── workspace.xml
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── entrypoint.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/zhihuuser/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/entrypoint.py:
--------------------------------------------------------------------------------
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'zhihu'])
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/items.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/items.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/pipelines.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/pipelines.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/settings.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/settings.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__pycache__/zhihu.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/spiders/__pycache__/zhihu.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/spiders/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
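entrypoint.py above is a one-line launcher: it hands the same arguments to scrapy.cmdline.execute as running `scrapy crawl zhihu` from the project root, which makes it convenient to start the spider from an IDE or debugger. A roughly equivalent sketch using Scrapy's programmatic API instead of the CLI wrapper (the file name run_zhihu.py is only illustrative and not part of the repository):

# run_zhihu.py -- hypothetical alternative to entrypoint.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # Load zhihuuser/settings.py the same way `scrapy crawl` would.
    process = CrawlerProcess(get_project_settings())
    process.crawl('zhihu')  # spider name declared in zhihuuser/spiders/zhihu.py
    process.start()         # blocks until the crawl finishes

Either variant has to be run from the directory containing scrapy.cfg so the project settings can be located.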
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zhihuuser.settings

[deploy]
#url = http://localhost:6800/
project = zhihuuser
--------------------------------------------------------------------------------
/zhihuuser/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm project metadata; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
/zhihuuser/.idea/zhihuuser.iml:
--------------------------------------------------------------------------------
(PyCharm module file; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
/zhihuuser/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class ZhihuuserPipeline(object):
    def __init__(self):
        self.client = MongoClient()
        self.database = self.client['zhihu_spider']
        self.db = self.database['zhihu_user_information']

    def process_item(self, item, spider):
        # Upsert keyed on each user's url_token: replace the document if the
        # user is already stored, insert it otherwise.
        self.db.replace_one({'url_token': item['url_token']}, dict(item), upsert=True)
        return item

    def close_spider(self, spider):
        self.client.close()
--------------------------------------------------------------------------------
/zhihuuser/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhihuUserItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    answer_count = scrapy.Field()      # number of answers
    articles_count = scrapy.Field()    # number of articles written
    follower_count = scrapy.Field()    # number of followers
    following_count = scrapy.Field()   # number of people this user follows
    educations = scrapy.Field()        # education background
    description = scrapy.Field()       # personal description
    locations = scrapy.Field()         # locations
    url_token = scrapy.Field()         # unique ID Zhihu assigns to every user's profile page
    name = scrapy.Field()              # user nickname
    employments = scrapy.Field()       # employment information
    business = scrapy.Field()          # collection of work / business information
    user_type = scrapy.Field()         # user type: an individual, an organization, and so on
    headline = scrapy.Field()          # headline shown on the profile page
    voteup_count = scrapy.Field()      # number of upvotes received
    thanked_count = scrapy.Field()     # number of thanks received
    favorited_count = scrapy.Field()   # number of times the user's content was favorited
    avatar_url = scrapy.Field()        # avatar URL
--------------------------------------------------------------------------------
/zhihuuser/.idea/misc.xml:
--------------------------------------------------------------------------------
(PyCharm project metadata; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
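Since process_item in pipelines.py above looks up every scraped user by url_token before writing, a unique index on that field keeps the upsert cheap once the collection grows to hundreds of thousands of users. A minimal one-off sketch, assuming MongoDB runs on localhost and the database and collection names match pipelines.py (this helper is not part of the repository):

# create_index.py -- hypothetical one-off helper
from pymongo import MongoClient

client = MongoClient()
users = client['zhihu_spider']['zhihu_user_information']

# With a unique index, each replace_one() in ZhihuuserPipeline becomes an
# indexed lookup instead of a collection scan, and accidental duplicate
# profiles are rejected by the database itself.
users.create_index('url_token', unique=True)
client.close()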
--------------------------------------------------------------------------------
/zhihuuser/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for zhihuuser project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zhihuuser'

SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhihuuser.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'zhihuuser.pipelines.ZhihuuserPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
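Compared with the generated template, the settings above change three things: ROBOTSTXT_OBEY is switched off, ITEM_PIPELINES enables the MongoDB pipeline, and DEFAULT_REQUEST_HEADERS attaches a browser User-Agent plus the anonymous `authorization: oauth ...` header that Zhihu's v4 web API expected at the time this project was written; without it the API tended to answer 401. A throwaway way to check whether those headers are still accepted before starting a long crawl (a sketch only: the token above may have expired, `requests` is assumed to be installed, and wang-tuan-jie-55 is simply the seed user from the spider below):

# check_headers.py -- hypothetical smoke test, not part of the repository
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}

url = 'https://www.zhihu.com/api/v4/members/wang-tuan-jie-55'
resp = requests.get(url, params={'include': 'answer_count,follower_count'}, headers=headers)

# A 200 with a JSON body means the headers still work; a 401 means the
# anonymous oauth token has to be refreshed from the browser's network panel.
print(resp.status_code)
print(resp.json() if resp.ok else resp.text[:200])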
--------------------------------------------------------------------------------
/zhihuuser/spiders/zhihu.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import json
from zhihuuser.items import ZhihuUserItem  # import the item defined above; the path starts from the project's top-level package


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['http://www.zhihu.com/']
    start_user = 'wang-tuan-jie-55'  # url_token of the seed user the crawl starts from

    # Query parameter that has to accompany requests for the users in a follower
    # or followee list (copied from the browser's network panel).
    include_follow = 'data[*].answer_count, articles_count, gender, follower_count, is_followed, is_following, badge[?(type = best_answerer)].topics'

    # Query parameter that has to accompany requests for a single user's profile.
    include_userinfo = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'

    # URL template for a user's follower list: user_name is the user's url_token,
    # include_follow is the query parameter above (all of this can be copied from
    # the browser), offset selects the page and limit is the page size, which the
    # site defaults to 20.
    followers_url = 'https://www.zhihu.com/api/v4/members/{user_name}/followers?include={include_follow}&offset={offset}&limit={limit}'
    # URL template for a user's followee list; it differs from the one above by a single letter.
    followees_url = 'https://www.zhihu.com/api/v4/members/{user_name}/followees?include={include_follow}&offset={offset}&limit={limit}'
    # URL template for a single user's profile information.
    userinfo_url = 'https://www.zhihu.com/api/v4/members/{user_name}?include={include_userinfo}'

    def start_requests(self):
        # Request the seed user's detailed profile...
        yield Request(url=self.userinfo_url.format(user_name=self.start_user, include_userinfo=self.include_userinfo), callback=self.get_user_info)
        # ...then the first page of the seed user's followers and followees.
        yield Request(url=self.followers_url.format(user_name=self.start_user, include_follow=self.include_follow, offset=0, limit=20), callback=self.get_followers_parse)
        yield Request(url=self.followees_url.format(user_name=self.start_user, include_follow=self.include_follow, offset=0, limit=20), callback=self.get_followees_parse)

    def get_user_info(self, response):
        # Parse a single user's profile.
        data = json.loads(response.text)
        item = ZhihuUserItem()
        for field in item.fields:  # the keys declared on ZhihuUserItem: locations, employments, and so on
            if field in data.keys():
                item[field] = data.get(field)  # copy the value out of the response dict
        yield item
        # Queue this user's own follower and followee lists so the crawl keeps expanding.
        yield Request(url=self.followers_url.format(user_name=data.get('url_token'), include_follow=self.include_follow, offset=0, limit=20), callback=self.get_followers_parse)
        yield Request(url=self.followees_url.format(user_name=data.get('url_token'), include_follow=self.include_follow, offset=0, limit=20), callback=self.get_followees_parse)

    def get_followers_parse(self, response):
        # Parse one page of a user's follower list.
        try:  # guards against users who have no followers at all
            followers_data = json.loads(response.text)

            try:
                if followers_data.get('data'):  # 'data' is a list of dicts, one dict per follower
                    for one_user in followers_data.get('data'):
                        user_name = one_user['url_token']  # use the url_token to fetch this follower's detailed profile
                        yield Request(url=self.userinfo_url.format(user_name=user_name, include_userinfo=self.include_userinfo), callback=self.get_user_info)
                        # Every follower's or followee's url_token is slotted into the profile
                        # URL template built above, so their information gets scraped as well.

                if 'paging' in followers_data.keys() and followers_data.get('paging').get('is_end') is False:
                    yield Request(url=followers_data.get('paging').get('next'), callback=self.get_followers_parse)
            except Exception as e:
                print(e, 'this user has no url_token')
        except Exception as e:
            print(e, 'this user has no followers')

    def get_followees_parse(self, response):
        # Parse one page of a user's followee list.
        try:  # guards against users who follow nobody
            followees_data = json.loads(response.text)
            try:
                if followees_data.get('data'):
                    for one_user in followees_data.get('data'):
                        user_name = one_user['url_token']  # use the url_token to fetch this followee's detailed profile
                        yield Request(url=self.userinfo_url.format(user_name=user_name, include_userinfo=self.include_userinfo), callback=self.get_user_info)

                if 'paging' in followees_data.keys() and followees_data.get('paging').get('is_end') is False:  # is there a next page?
                    yield Request(url=followees_data.get('paging').get('next'), callback=self.get_followees_parse)
            except Exception as e:
                print(e, 'this user has no url_token or data')
        except Exception as e:
            print(e, 'this user follows nobody')
--------------------------------------------------------------------------------
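The two list callbacks above only depend on a small part of the follower/followee response: the 'data' array and the 'paging' block. Reconstructed from the fields the code actually reads (the values below are made up for illustration), one page looks roughly like this, and pagination simply keeps yielding paging['next'] until is_end becomes true:

# Approximate shape of one followers/followees page, as consumed by the callbacks above.
page = {
    'data': [
        # one dict per user; only 'url_token' is required by the spider,
        # the other keys come from the include_follow parameter
        {'url_token': 'some-user', 'answer_count': 12, 'follower_count': 340},
    ],
    'paging': {
        'is_end': False,  # True on the last page
        'next': 'https://www.zhihu.com/api/v4/members/some-user/followers?offset=20&limit=20',
    },
}

Because the crawl fans out from every profile it visits, it has no natural stopping point; Scrapy's standard CLOSESPIDER_ITEMCOUNT or CLOSESPIDER_PAGECOUNT settings are a convenient way to cap a test run.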
--------------------------------------------------------------------------------
/zhihuuser/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm workspace state; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------