├── README.md ├── index.py └── python_zhihu.py /README.md: -------------------------------------------------------------------------------- 1 | # python_zhihu 2 | 一个知乎爬虫(最新),模拟登录,获取答案。 3 | 4 | python环境:python3.X 5 | 依赖的包:requests、Pillow(用于显示验证码图片) 6 | 7 | 8 | 下载文件到某个文件夹,然后运行,根据提示执行程序。 9 | 10 | ![](https://github.com/ladingwu/python_zhihu/blob/master/example.jpg) 11 | 12 | **验证码问题**:程序会下载验证码到文件目录下,你需要手动填写验证码,登录一次之后,会记录你的cookies,下次可以直接登录,无需填写密码等,cookies文件也在程序文件目录下。 13 | 14 | 实例化: 15 | 16 | from python_zhihu import ZhiHu 17 | 18 | zh=ZhiHu() 19 | 20 | 下载某个问题下的高赞答案: 21 | 22 | zh.get_answer_text('某问题的url') 23 | > 这个方法会下载某个问题下的高赞文字答案,存储在一个txt文件中 24 | 25 | 下载某个问题下所有的图片: 26 | 27 | zh.get_answer_img('某问题的url') 28 | > 这个方法会下载某个问题下的答案中的所有图片,并且按照回答人的昵称归类 29 | 30 | ## 更新 31 | 通过不同的方式(邮箱或手机号)登录 32 | 33 | ## 2016-12-14,继续更新 34 | 有同学提交了一点代码,可以自动打开验证码图片,并且可以按赞数排序,棒! 35 | 36 | 后期可能会加上更多功能,敬请期待.... 37 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from python_zhihu import ZhiHu 4 | 5 | zh=ZhiHu() 6 | zh.get_answer_img('https://www.zhihu.com/question/34078228') 7 | -------------------------------------------------------------------------------- /python_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python 3 | import requests,re,json,time,os,os.path,sys 4 | #显示验证码 5 | from PIL import Image 6 | import traceback 7 | import json 8 | #模拟知乎登陆,主要是获取验证码登陆 9 | _zhihu_url='https://www.zhihu.com' 10 | _captcha_url=_zhihu_url+'/captcha.gif?r=' 11 | _captcha_url_end="&type=login"; 12 | header_data={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding':'gzip, deflate, sdch, br', 14 | 'Accept-Language':'zh-CN,zh;q=0.8', 15 | 'Connection':'keep-alive', 16 | 'Cache-Control':'max-age=0' 17 | ,'Host':'www.zhihu.com' 18 | 
,'Upgrade-Insecure-Requests':'1' 19 | ,'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36' 20 | 21 | } 22 | 23 | 24 | class ZhiHu(): 25 | 26 | _session=None 27 | 28 | favor_data=100 29 | 30 | def __init__(self): 31 | self.do_first() 32 | def get_captcha(self): 33 | return _captcha_url+str(int(time.time()*1000))+_captcha_url_end 34 | def show_or_save_captcha(self,url): 35 | global _session 36 | r=_session.get(url,headers=header_data,verify=True) 37 | with open("code.gif",'wb') as f: 38 | f.write(r.content) 39 | #显示验证码 40 | try: 41 | print("haha") 42 | im = Image.open("code.gif") 43 | im.show() 44 | except: 45 | print("请打开下载的验证码文件code.gif") 46 | 47 | def input_data(self): 48 | global email 49 | global password 50 | global question_url 51 | self.username=raw_input('请输入用户名:') 52 | self.password=raw_input('请输入密码:') 53 | self.show_or_save_captcha(self.get_captcha()) 54 | self.captcha=raw_input('请输入验证码:') 55 | 56 | 57 | def login(self): 58 | global _session 59 | global header_data 60 | global xsrf 61 | r=_session.get('https://www.zhihu.com',headers=header_data,verify=True) 62 | self.xsrf=re.findall('name="_xsrf" value="([\S\s]*?)"',r.text)[0] 63 | 64 | self.input_data() 65 | #确定用户名类型 66 | if re.search(r'^1\d{10}$', self.username): 67 | _type='phone_num' 68 | _login_type='/login/phone_num' 69 | elif re.search(r'(.+)@(.+)', self.username): 70 | _login_type='/login/email' 71 | _type='email' 72 | else: 73 | print('用户名格式不正确') 74 | sys.exit(1) 75 | 76 | 77 | login_data = {' _xsrf':self.xsrf,_type:self.username,'password':self.password,'rememberme':'true' 78 | ,'captcha':self.captcha} 79 | r=_session.post(_zhihu_url+_login_type,data=login_data,headers=header_data,verify=True) 80 | j=r.json() 81 | c=int(j['r']) 82 | if c==0: 83 | print('sign in successful') 84 | 85 | self.save_cookies() 86 | os.remove("code.gif") 87 | 88 | 89 | else: 90 | print('登陆出现问题。。。。') 91 | 92 | import pickle,json 93 | def 
save_cookies(self): 94 | global _session,path_for 95 | with open('./'+"cookiefile",'w')as f: 96 | json.dump(_session.cookies.get_dict(),f) 97 | #_session.cookies.save() 98 | 99 | def read_cookies(self): 100 | global _session,path_for 101 | #_session.cookies.load() 102 | #_session.headers.update(header_data) 103 | with open('./'+'cookiefile')as f: 104 | cookie=json.load(f) 105 | _session.cookies.update(cookie) 106 | 107 | def get_answer_text(self,url,answers=15): 108 | global _session 109 | global favor_data 110 | r=_session.get(url,headers=header_data,verify=True) 111 | pat=re.compile('"count">[\s]*?(.*?)') 112 | 113 | _list=re.findall(pat,r.text) 114 | #print(_list); 115 | #favor_list=[int(k) for k in _list] 116 | favor_list=[] 117 | #下面主要是将以“K”为单位的赞同数转化为数字 118 | for i in _list: 119 | if 'K' in i: 120 | #print('k in'+i) 121 | i = i.replace('K','000') 122 | favor_list.append(int(i)) 123 | else: 124 | #print(i) 125 | favor_list.append(int(i)) 126 | 127 | favor_list.sort(reverse=True) 128 | if len(favor_list)>=answers: 129 | favor_data=favor_list[answers-1] 130 | else: 131 | favor_data=0 132 | self.save_text(r) 133 | 134 | def get_answer_img(self,url): 135 | global _session 136 | r=_session.get(url,headers=header_data,verify=True).text 137 | item_pattern=re.compile('
') 138 | img_pattern=re.compile('([\s\S]*?)') 140 | #author_pattern=re.compile('([\S\s]*?)') 142 | items=re.findall(item_pattern,r) 143 | title=re.findall(pattern_title,r) 144 | authors=[] 145 | img_list=[] 146 | i=0 147 | try : 148 | 149 | for item in items: 150 | 151 | i+=1 152 | authors.append(re.findall(author_pattern,item)) 153 | 154 | img_list.append(re.findall(img_pattern,item)) 155 | 156 | 157 | except : 158 | print('查找出了一点问题') 159 | traceback.print_exc() 160 | try: 161 | #print(authors) 162 | j=0 163 | for author in authors: 164 | img_urls=img_list[j] 165 | #print(len(img_urls)) 166 | if len(img_urls) == 0: 167 | continue 168 | title_text=title[0]; 169 | author_text='' 170 | if len(author)>0: 171 | author_text=author[0] 172 | path=self.createPathIfNotExist(title_text+'\\'+author[0]) 173 | j+=1 174 | k=0 175 | for url in img_urls: 176 | if 'https' not in url: 177 | #print('坏图:'+url) 178 | continue 179 | print(url) 180 | temp=url.split('.') 181 | suffix='jpg' 182 | if len(temp)>0: 183 | 184 | suffix=temp[len(temp)-1] 185 | #print('suffix= '+suffix) 186 | 187 | k+=1 188 | with open(path+author_text+str(k)+'.'+suffix,'bw')as f: 189 | print('下载第'+str(j)+'个人'+'第'+str(k)+'照片') 190 | f.write(_session.get(url,verify=True).content) 191 | 192 | 193 | 194 | except: 195 | print('下载图片出了一点问题') 196 | traceback.print_exc() 197 | 198 | def createPathIfNotExist(self,path): 199 | root_path=os.path.abspath('.') 200 | p=root_path+'\\'+path+'\\' 201 | if not os.path.exists(p): 202 | os.makedirs(p) 203 | return p 204 | 205 | #def getAnswerByPage(self): 206 | 
#t='include=data%5B*%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics' 207 | #r=_session.get('https://www.zhihu.com/api/v4/questions/'+'48337357'+'/answers?+'+t+'&offset=0&limit=20&sort_by=default',headers=header_data,verify=True) 208 | #jdata=json.loads(r.text); 209 | #return jdata 210 | 211 | 212 | def save_text(self,r): 213 | global path_for 214 | pattern_title=re.compile('([\s\S]*?)') 215 | pattern_desc=re.compile('
([\s\S]*?)
') 216 | pattern_answer=re.compile('[\s]*?([\S]*?)[\s\S]*?
([\s\S]*?)
') 217 | 218 | title=re.findall(pattern_title,r.text) 219 | #print('title:'+title[0]); 220 | desc=re.findall(pattern_desc,r.text) 221 | #print(title,desc) 222 | #a=re.sub(re.compile('
'),'\n',r.text) 223 | answer_favor_list=re.findall(pattern_answer,r.text) 224 | pat_sub=re.compile('
') 225 | with open('./'+title[0]+'.txt','w',encoding='utf-8') as f: 226 | try: 227 | 228 | f.write('问题:'+title[0]+'\n\n') 229 | f.write('描述:'+desc[0]+'\n\n') 230 | #按赞同数多少对答案排序 231 | answer_favor_list = sorted(answer_favor_list, reverse=True, key=self.get_int_list) 232 | for i,answer in enumerate(answer_favor_list): 233 | #print('answer[0]--->'+answer[0]) 234 | if(self.get_int(answer[0])>favor_data): 235 | f.write('\n-------------------''答案'+str(i+1)+'(赞同:'+answer[0]+')''---------------------\n') 236 | f.write('\n答案'+str(i+1)+'(赞同:'+answer[0]+')-->'+re.sub(pat_sub,'\n',answer[1])) 237 | f.write('\n++++++++++++++++++++++++this answer is over++++++++++++++++++++++++++++++') 238 | f.write('\n\n') 239 | except Exception as e: 240 | print('可能在文件读写的时候出了一点问题。。。') 241 | traceback.print_exc() 242 | def get(self,url): 243 | return _session.get(url,headers=header_data,verify=True) 244 | def get_int(self ,s): 245 | if 'K' in s: 246 | return int(s.replace('K','000')) 247 | return int(s) 248 | def get_int_list(self, answer_list): 249 | return self.get_int(answer_list[0]) 250 | def do_first(self): 251 | global _session 252 | _session=requests.session() 253 | if os.path.exists('cookiefile'): 254 | #print('have cookies') 255 | self.read_cookies() 256 | #self.get_text(question_url) 257 | else: 258 | self.login() 259 | 260 | --------------------------------------------------------------------------------