├── README.md
├── entrypoint.py
├── scrapy.cfg
└── zhihuuser
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    ├── .idea
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── workspace.xml
    │   └── zhihuuser.iml
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   ├── items.cpython-35.pyc
    │   ├── pipelines.cpython-35.pyc
    │   └── settings.cpython-35.pyc
    └── spiders
        ├── __init__.py
        ├── zhihu.py
        └── __pycache__
            ├── __init__.cpython-35.pyc
            └── zhihu.cpython-35.pyc
/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/zhihuuser/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/entrypoint.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 | execute(['scrapy','crawl','zhihu'])
--------------------------------------------------------------------------------
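Note: entrypoint.py simply drives "scrapy crawl zhihu" from Python so the spider can be launched (and debugged) from an IDE. As an aside not taken from the project, the variant below additionally exports the scraped items to a JSON feed; "-o" is Scrapy's standard feed-export option and "users.json" is only an illustrative filename.

    # Same as entrypoint.py, plus a JSON feed export ('users.json' is illustrative).
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'zhihu', '-o', 'users.json'])
--------------------------------------------------------------------------------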
/zhihuuser/__pycache__/items.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/items.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/pipelines.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/pipelines.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/__pycache__/settings.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/__pycache__/settings.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__pycache__/zhihu.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/spiders/__pycache__/zhihu.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaobeibei26/zhihu_user_spider/HEAD/zhihuuser/spiders/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/zhihuuser/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = zhihuuser.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zhihuuser
12 |
--------------------------------------------------------------------------------
/zhihuuser/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 |
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from pymongo import MongoClient
 8 |
 9 |
10 | class ZhihuuserPipeline(object):
11 |     def __init__(self):
12 |         self.client = MongoClient()
13 |         self.database = self.client['zhuhu_spider']
14 |         self.db = self.database['zhuhu_user_infomation']  # collection holding the scraped user records
15 |
16 |     def process_item(self, item, spider):
17 |         # Upsert keyed on each user's url_token: update the record if it exists,
18 |         # insert it otherwise.
19 |         self.db.update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
20 |         return item
21 |
22 |     def close_spider(self, spider):
23 |         self.client.close()
24 |
--------------------------------------------------------------------------------
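Note: the pipeline above upserts each scraped user into MongoDB, keyed on url_token, and closes the client when the spider finishes. The sketch below is not part of the project; it shows the same upsert written in the common settings-driven style, where the connection details come from the Scrapy settings via from_crawler. The setting names MONGO_URI and MONGO_DATABASE are assumptions introduced here for illustration only.

    from pymongo import MongoClient


    class MongoUpsertPipeline(object):
        # Sketch of a settings-driven variant of ZhihuuserPipeline (not from the project).

        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            # MONGO_URI / MONGO_DATABASE are hypothetical setting names.
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
                mongo_db=crawler.settings.get('MONGO_DATABASE', 'zhuhu_spider'),
            )

        def open_spider(self, spider):
            self.client = MongoClient(self.mongo_uri)
            self.collection = self.client[self.mongo_db]['zhuhu_user_infomation']

        def process_item(self, item, spider):
            # Same upsert as the original pipeline, keyed on url_token.
            self.collection.update_one({'url_token': item['url_token']},
                                       {'$set': dict(item)}, upsert=True)
            return item

        def close_spider(self, spider):
            self.client.close()
--------------------------------------------------------------------------------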
/zhihuuser/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 |
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 |
 8 | import scrapy
 9 |
10 |
11 | class ZhihuUserItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     answer_count = scrapy.Field()     # number of answers
15 |     articles_count = scrapy.Field()   # number of articles written
16 |     follower_count = scrapy.Field()   # number of followers
17 |     following_count = scrapy.Field()  # number of people the user follows
18 |     educations = scrapy.Field()       # education background
19 |     description = scrapy.Field()      # personal description
20 |     locations = scrapy.Field()        # locations
21 |     url_token = scrapy.Field()        # unique ID Zhihu assigns to each user's profile page
22 |     name = scrapy.Field()             # user nickname
23 |     employments = scrapy.Field()      # employment information
24 |     business = scrapy.Field()         # collection of work / business information
25 |     user_type = scrapy.Field()        # user type: an individual, an organization, etc.
26 |     headline = scrapy.Field()         # tagline shown on the profile page
27 |     voteup_count = scrapy.Field()     # number of upvotes received
28 |     thanked_count = scrapy.Field()    # number of thanks received
29 |     favorited_count = scrapy.Field()  # number of times the user's content was favorited
30 |     avatar_url = scrapy.Field()       # avatar URL
31 |
--------------------------------------------------------------------------------
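Note: ZhihuUserItem behaves like a dictionary restricted to the fields declared above; the spider fills in whichever of these keys appear in the API response. A minimal, hypothetical usage example (not taken from the project):

    from zhihuuser.items import ZhihuUserItem

    item = ZhihuUserItem()
    item['url_token'] = 'wang-tuan-jie-55'   # the seed user's ID used in the spider
    item['follower_count'] = 0               # illustrative value
    print(dict(item))                        # {'url_token': 'wang-tuan-jie-55', 'follower_count': 0}
    # Assigning an undeclared key, e.g. item['unknown_field'] = 1, raises KeyError,
    # which catches field-name typos early.
--------------------------------------------------------------------------------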
/zhihuuser/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for zhihuuser project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'zhihuuser'
13 |
14 | SPIDER_MODULES = ['zhihuuser.spiders']
15 | NEWSPIDER_MODULE = 'zhihuuser.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
44 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
45 | }
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'zhihuuser.middlewares.MyCustomSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------
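Note: the non-default parts of settings.py are ROBOTSTXT_OBEY = False, the DEFAULT_REQUEST_HEADERS carrying a browser User-Agent plus a hard-coded oauth authorization value, and the ZhihuuserPipeline registration. If the crawl needs to be slowed down, the throttling settings the template already lists (commented out) could be enabled; the values below are illustrative assumptions, not the author's configuration.

    # Optional politeness settings (illustrative values, not enabled in the project)
    DOWNLOAD_DELAY = 0.5
    CONCURRENT_REQUESTS_PER_DOMAIN = 8
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
--------------------------------------------------------------------------------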
/zhihuuser/spiders/zhihu.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy import Request
 4 | import json
 5 | from zhihuuser.items import ZhihuUserItem  # import the item defined in items.py; the path starts from the project's top-level package
 6 |
 7 |
 8 | class ZhihuSpider(scrapy.Spider):
 9 |     name = "zhihu"
10 |     allowed_domains = ["www.zhihu.com"]
11 |     start_urls = ['http://www.zhihu.com/']
12 |     start_user = 'wang-tuan-jie-55'  # url_token of the seed user the crawl starts from
13 |
14 |     # Query parameter attached when requesting the users in a follower or followee list.
15 |     include_follow = 'data[*].answer_count, articles_count, gender, follower_count, is_followed, is_following, badge[?(type = best_answerer)].topics'
16 |     # Query parameter attached when requesting a single user's detailed profile.
17 |     include_userinfo = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
18 |
19 |     # Follower-list URL: user_name is the user's ID and include_follow the query parameter
20 |     # (both copied from the browser); offset selects the page of followers/followees and
21 |     # limit the page size (the site default is 20).
22 |     followers_url = 'https://www.zhihu.com/api/v4/members/{user_name}/followers?include={include_follow}&offset={offset}&limit={limit}'
23 |     # Followee-list URL; it differs from the one above by a single letter.
24 |     followees_url = 'https://www.zhihu.com/api/v4/members/{user_name}/followees?include={include_follow}&offset={offset}&limit={limit}'
25 |     # URL for fetching a user's detailed profile.
26 |     userinfo_url = 'https://www.zhihu.com/api/v4/members/{user_name}?include={include_userinfo}'
27 |
28 |     def start_requests(self):
29 |         # Request the seed user's detailed profile ...
30 |         yield Request(url=self.userinfo_url.format(user_name=self.start_user, include_userinfo=self.include_userinfo),
31 |                       callback=self.get_user_info)
32 |         # ... then the seed user's follower list, and the followee list below.
33 |         yield Request(url=self.followers_url.format(user_name=self.start_user, include_follow=self.include_follow, offset=0, limit=20),
34 |                       callback=self.get_followers_parse)
35 |         yield Request(url=self.followees_url.format(user_name=self.start_user, include_follow=self.include_follow, offset=0, limit=20),
36 |                       callback=self.get_followees_parse)
37 |
38 |     def get_user_info(self, response):  # parse a user's detailed profile
39 |         data = json.loads(response.text)
40 |         item = ZhihuUserItem()
41 |         for field in item.fields:  # the keys defined in the item: locations, employments, ...
42 |             if field in data.keys():
43 |                 item[field] = data.get(field)  # copy the value from the response dict
44 |         yield item
45 |         # Also crawl this user's followers and followees.
46 |         yield Request(url=self.followers_url.format(user_name=data.get('url_token'), include_follow=self.include_follow, offset=0, limit=20),
47 |                       callback=self.get_followers_parse)
48 |         yield Request(url=self.followees_url.format(user_name=data.get('url_token'), include_follow=self.include_follow, offset=0, limit=20),
49 |                       callback=self.get_followees_parse)
50 |
51 |     def get_followers_parse(self, response):  # parse a follower list
52 |         try:  # the outer try guards against users who have no followers
53 |             followers_data = json.loads(response.text)
54 |             try:
55 |                 if followers_data.get('data'):  # 'data' is a list of dicts, one per follower
56 |                     for one_user in followers_data.get('data'):
57 |                         user_name = one_user['url_token']  # take the url_token and request that user's profile
58 |                         yield Request(url=self.userinfo_url.format(user_name=user_name, include_userinfo=self.include_userinfo),
59 |                                       callback=self.get_user_info)
60 |                         # Every follower's url_token is fed back into the profile URL built above,
61 |                         # so their details get crawled as well.
62 |                 if 'paging' in followers_data.keys() and followers_data.get('paging').get('is_end') is False:
63 |                     # Not the last page yet: request the next page of followers.
64 |                     yield Request(url=followers_data.get('paging').get('next'), callback=self.get_followers_parse)
65 |             except Exception as e:
66 |                 print(e, 'this user has no url_token')
67 |         except Exception as e:
68 |             print(e, 'this user has no followers')
69 |
70 |     def get_followees_parse(self, response):  # parse a followee list
71 |         try:  # the outer try guards against users who follow no one
72 |             followees_data = json.loads(response.text)
73 |             try:
74 |                 if followees_data.get('data'):
75 |                     for one_user in followees_data.get('data'):
76 |                         user_name = one_user['url_token']  # take the url_token and request that user's profile
77 |                         yield Request(url=self.userinfo_url.format(user_name=user_name, include_userinfo=self.include_userinfo),
78 |                                       callback=self.get_user_info)
79 |                 if 'paging' in followees_data.keys() and followees_data.get('paging').get('is_end') is False:  # is there a next page?
80 |                     yield Request(url=followees_data.get('paging').get('next'), callback=self.get_followees_parse)
81 |             except Exception as e:
82 |                 print(e, 'this user has no url_token or data')
83 |         except Exception as e:
84 |             print(e, 'this user follows no one')
85 |
--------------------------------------------------------------------------------
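Note: the spider assumes each follower/followee endpoint returns JSON with a 'data' list of user dicts (each carrying a url_token) and a 'paging' object with 'is_end' and 'next'. The snippet below mirrors that assumed shape with made-up values to show what the pagination logic in get_followers_parse / get_followees_parse reads; it is an illustration, not captured API output.

    # Assumed response shape (values are made up for illustration).
    example_response = {
        "data": [
            {"url_token": "some-user", "answer_count": 12, "follower_count": 340},
        ],
        "paging": {
            "is_end": False,
            "next": "https://www.zhihu.com/api/v4/members/wang-tuan-jie-55/followers?offset=20&limit=20",
        },
    }

    for one_user in example_response["data"]:
        print(one_user["url_token"])               # the spider feeds this into userinfo_url
    if example_response["paging"]["is_end"] is False:
        print(example_response["paging"]["next"])  # URL of the next page to request
--------------------------------------------------------------------------------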