├── README.md ├── index.py └── python_zhihu.py /README.md: -------------------------------------------------------------------------------- 1 | # python_zhihu 2 | 一个知乎爬虫(最新),支持模拟登录和获取答案。 3 | 4 | python环境:python3.X 5 | 依赖的包:requests 6 | 7 | 8 | 下载文件到某个文件夹,然后运行,根据提示执行程序。 9 | 10 |  11 | 12 | **验证码问题**:程序会下载验证码到文件目录下,你需要手动填写验证码,登录一次之后,会记录你的cookies,下次可以直接登录,无需填写密码等,cookies文件也在程序文件目录下。 13 | 14 | 实例化: 15 | 16 | from python_zhihu import ZhiHu 17 | 18 | zh=ZhiHu() 19 | 20 | 下载某个问题下的高赞答案: 21 | 22 | zh.get_answer_text('某问题的url') 23 | > 这个方法会下载某个问题下的高赞文字答案,存储在一个txt文件中。 24 | 25 | 下载某个问题下所有的图片: 26 | 27 | zh.get_answer_img('某问题的url') 28 | > 这个方法会下载某个问题下的答案中的所有图片,并且按照回答人的昵称归类。 29 | 30 | ## 更新 31 | 支持通过不同的方式(邮箱或手机号)登录。 32 | 33 | ## 2016-12-14,继续更新 34 | 有同学提交了一点代码,可以自动打开验证码图片,并且可以按赞数排序,棒! 35 | 36 | 后期可能会加上更多功能,敬请期待…… 37 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from python_zhihu import ZhiHu 4 | 5 | zh=ZhiHu() 6 | zh.get_answer_img('https://www.zhihu.com/question/34078228') 7 | -------------------------------------------------------------------------------- /python_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python 3 | import requests,re,json,time,os,os.path,sys 4 | #显示验证码 5 | from PIL import Image 6 | import traceback 7 | import json 8 | #模拟知乎登陆,主要是获取验证码登陆 9 | _zhihu_url='https://www.zhihu.com' 10 | _captcha_url=_zhihu_url+'/captcha.gif?r=' 11 | _captcha_url_end="&type=login"; 12 | header_data={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding':'gzip, deflate, sdch, br', 14 | 'Accept-Language':'zh-CN,zh;q=0.8', 15 | 'Connection':'keep-alive', 16 | 'Cache-Control':'max-age=0' 17 | ,'Host':'www.zhihu.com' 18 | ,'Upgrade-Insecure-Requests':'1' 19 | ,'User-Agent':'Mozilla/5.0 
(Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36' 20 | 21 | } 22 | 23 | 24 | class ZhiHu(): 25 | 26 | _session=None 27 | 28 | favor_data=100 29 | 30 | def __init__(self): 31 | self.do_first() 32 | def get_captcha(self): 33 | return _captcha_url+str(int(time.time()*1000))+_captcha_url_end 34 | def show_or_save_captcha(self,url): 35 | global _session 36 | r=_session.get(url,headers=header_data,verify=True) 37 | with open("code.gif",'wb') as f: 38 | f.write(r.content) 39 | #显示验证码 40 | try: 41 | print("haha") 42 | im = Image.open("code.gif") 43 | im.show() 44 | except: 45 | print("请打开下载的验证码文件code.gif") 46 | 47 | def input_data(self): 48 | global email 49 | global password 50 | global question_url 51 | self.username=raw_input('请输入用户名:') 52 | self.password=raw_input('请输入密码:') 53 | self.show_or_save_captcha(self.get_captcha()) 54 | self.captcha=raw_input('请输入验证码:') 55 | 56 | 57 | def login(self): 58 | global _session 59 | global header_data 60 | global xsrf 61 | r=_session.get('https://www.zhihu.com',headers=header_data,verify=True) 62 | self.xsrf=re.findall('name="_xsrf" value="([\S\s]*?)"',r.text)[0] 63 | 64 | self.input_data() 65 | #确定用户名类型 66 | if re.search(r'^1\d{10}$', self.username): 67 | _type='phone_num' 68 | _login_type='/login/phone_num' 69 | elif re.search(r'(.+)@(.+)', self.username): 70 | _login_type='/login/email' 71 | _type='email' 72 | else: 73 | print('用户名格式不正确') 74 | sys.exit(1) 75 | 76 | 77 | login_data = {' _xsrf':self.xsrf,_type:self.username,'password':self.password,'rememberme':'true' 78 | ,'captcha':self.captcha} 79 | r=_session.post(_zhihu_url+_login_type,data=login_data,headers=header_data,verify=True) 80 | j=r.json() 81 | c=int(j['r']) 82 | if c==0: 83 | print('sign in successful') 84 | 85 | self.save_cookies() 86 | os.remove("code.gif") 87 | 88 | 89 | else: 90 | print('登陆出现问题。。。。') 91 | 92 | import pickle,json 93 | def save_cookies(self): 94 | global _session,path_for 95 | with 
open('./'+"cookiefile",'w')as f: 96 | json.dump(_session.cookies.get_dict(),f) 97 | #_session.cookies.save() 98 | 99 | def read_cookies(self): 100 | global _session,path_for 101 | #_session.cookies.load() 102 | #_session.headers.update(header_data) 103 | with open('./'+'cookiefile')as f: 104 | cookie=json.load(f) 105 | _session.cookies.update(cookie) 106 | 107 | def get_answer_text(self,url,answers=15): 108 | global _session 109 | global favor_data 110 | r=_session.get(url,headers=header_data,verify=True) 111 | pat=re.compile('"count">[\s]*?(.*?)') 112 | 113 | _list=re.findall(pat,r.text) 114 | #print(_list); 115 | #favor_list=[int(k) for k in _list] 116 | favor_list=[] 117 | #下面主要是将以“K”为单位的赞同数转化为数字 118 | for i in _list: 119 | if 'K' in i: 120 | #print('k in'+i) 121 | i = i.replace('K','000') 122 | favor_list.append(int(i)) 123 | else: 124 | #print(i) 125 | favor_list.append(int(i)) 126 | 127 | favor_list.sort(reverse=True) 128 | if len(favor_list)>=answers: 129 | favor_data=favor_list[answers-1] 130 | else: 131 | favor_data=0 132 | self.save_text(r) 133 | 134 | def get_answer_img(self,url): 135 | global _session 136 | r=_session.get(url,headers=header_data,verify=True).text 137 | item_pattern=re.compile('