├── .gitignore ├── README.md ├── analyze_page ├── cookies ├── getCollections.py ├── getContent.py ├── login_fetch.py └── next_page.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /.idea 3 | /content 4 | /content_v1 5 | /content_v2 6 | data.cfg 7 | cookies 8 | /collections 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zhihuToKindle 2 | #版本更新 2016-08-18 3 | #模拟知乎登录,获取自己关注的问题答案 4 | # 在当前目录下创建data.cfg(第一行是用户名,第二行是密码),然后直接运行python login_fetch.py 5 | # 如遇到需要验证码时,打开本目录下的photo.jpg文件,然后输入验证码即可。 6 | 7 | #推送知乎文章到kindle电子书 8 | 9 | 10 | #使用: python zhihu.py id 11 | id为问题的id: https://www.zhihu.com/question/35461941, id为35461941 12 | -------------------------------------------------------------------------------- /analyze_page: -------------------------------------------------------------------------------- 1 |
14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 26 | 30 |
31 | 32 | 33 |
34 |
35 | 36 | 43 | 44 | 45 | 46 | 47 | 剑飞来自火星 51 | PhD student/Wizard/Swimmer 52 | 53 | 54 |
55 |
56 | 57 | 58 | 59 | 60 |
61 |
62 | 78 | 79 |
80 |
81 | 82 | 83 | 84 | 85 | 添加评论 86 | 87 | 88 | 感谢 89 | 90 | 91 | 92 | 分享 93 | 94 | 收藏 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 没有帮助 103 | 104 | 105 | 举报 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 作者保留权利 117 | 118 | 119 | 120 |
121 |
122 |
-------------------------------------------------------------------------------- /cookies: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: cap_id="\"NWZjZDg5MzdkMTRkNDMyZDgyMmM2ZjlkNWJiYjIzNmY=|1487004386|60d8677cd71b8a9b6b4baa3a8d75ee9c346055ed\""; path="/"; domain=".zhihu.com"; path_spec; expires="2017-03-15 16:46:26Z"; version=0 3 | Set-Cookie3: l_cap_id="\"NGZjMGRjMzRjMTQyNDExN2IxNGQ4MGRiMjU0MWJhZTE=|1487004386|b6ce9f53125c1f9fc8a38346c4310a2f293eb9b8\""; path="/"; domain=".zhihu.com"; path_spec; expires="2017-03-15 16:46:26Z"; version=0 4 | Set-Cookie3: login="\"NDhkMzMzYWUyNDFkNDc3Njg4OTUzYTgwZjgxZTRkNDk=|1487347452|31d09a8382f81fcdaea1d60dab55fa57bf570343\""; path="/"; domain=".zhihu.com"; path_spec; expires="2017-03-19 16:04:12Z"; version=0 5 | Set-Cookie3: q_c1="6e19826dd25e45b2be62a33f33f6c84d|1487004386000|1483776850000"; path="/"; domain=".zhihu.com"; path_spec; expires="2020-02-13 16:46:26Z"; version=0 6 | Set-Cookie3: z_c0="\"QUFCQXdHSTdBQUFYQUFBQVlRSlZUZnlyemxnV3NaYnV4c0ZrT3ZCSElxb3E2eUNyNUw4MlpRPT0=|1487347452|5e36da333574bb63825b5fca9305aa2db5f8940f\""; path="/"; domain=".zhihu.com"; path_spec; expires="2017-03-19 16:04:12Z"; httponly=None; version=0 7 | Set-Cookie3: _xsrf=8283b1ead4634053e3b575e3ea0591eb; path="/"; domain="www.zhihu.com"; path_spec; expires="2017-03-15 16:47:02Z"; version=0 8 | -------------------------------------------------------------------------------- /getCollections.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | __author__ = 'Rocky' 3 | import requests 4 | import cookielib 5 | import re 6 | import json 7 | import time 8 | from bs4 import BeautifulSoup 9 | import os 10 | from lxml import etree 11 | import codecs 12 | 13 | session = requests.session() 14 | 15 | session.cookies = cookielib.LWPCookieJar(filename="cookies") 16 | agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 
Firefox/33.0' 17 | headers = {'Host': 'www.zhihu.com', 18 | 'Referer': 'https://www.zhihu.com', 19 | 'User-Agent': agent} 20 | try: 21 | session.cookies.load(ignore_discard=True) 22 | except: 23 | print "Cookie can't load" 24 | 25 | def getUserData(configFile): 26 | f = open(configFile, 'r') 27 | 28 | #for i in f.readlines(): 29 | data=f.readlines() 30 | username=data[0].strip() 31 | pwd=data[1].strip() 32 | return username,pwd 33 | 34 | def getCaptcha(): 35 | #r=1471341285051 36 | r = (time.time() * 1000) 37 | url = 'http://www.zhihu.com/captcha.gif?r=' + str(r) + '&type=login' 38 | 39 | image = session.get(url, headers=headers) 40 | f = open("photo.jpg", 'wb') 41 | f.write(image.content) 42 | f.close() 43 | def get_xsrf(): 44 | url = 'https://www.zhihu.com' 45 | r = session.get(url, headers=headers, allow_redirects=False) 46 | txt = r.text 47 | #print txt 48 | result = re.findall(r'', txt)[0] 49 | return result 50 | 51 | def Login(): 52 | username,pwd=getUserData("data.cfg") 53 | xsrf = get_xsrf() 54 | print xsrf 55 | print len(xsrf) 56 | login_url = 'https://www.zhihu.com/login/email' 57 | data = { 58 | '_xsrf': xsrf, 59 | 'password': pwd, 60 | 'remember_me': 'true', 61 | 'email': username 62 | } 63 | try: 64 | content = session.post(login_url, data=data, headers=headers) 65 | login_code = content.text 66 | d = json.loads(login_code) 67 | #print d['msg'] 68 | #print content.status_code 69 | #this line important ! if no status, if will fail and execute the except part 70 | #print content.status 71 | 72 | if content.status_code != requests.codes.ok: 73 | print "Need to verification code !" 
74 | getCaptcha() 75 | #print "Please input the code of the captcha" 76 | code = raw_input("Please input the code of the captcha") 77 | data['captcha'] = code 78 | content = session.post(login_url, data=data, headers=headers) 79 | print content.status_code 80 | 81 | if content.status_code == requests.codes.ok: 82 | print "Login successful" 83 | session.cookies.save() 84 | #print login_code 85 | else: 86 | session.cookies.save() 87 | return True 88 | except: 89 | print "Error in login" 90 | return False 91 | 92 | def isLogin(): 93 | url = 'https://www.zhihu.com/settings/profile' 94 | login_code = session.get(url, headers=headers, allow_redirects=False).status_code 95 | print login_code 96 | if login_code == 200: 97 | return True 98 | else: 99 | return False 100 | def save2file(filename, content): 101 | # 保存为电子书文件 102 | filename = filename + ".txt" 103 | f = codecs.open(filename, 'a',encoding='utf-8') 104 | f.write(content) 105 | f.close() 106 | 107 | def getAnswer(url): 108 | #这个功能已经实现 109 | html=session.get(url,headers=headers,allow_redirects=False) 110 | s=html.text 111 | 112 | tree=etree.HTML(s) 113 | title=tree.xpath('//title/text()')[0] 114 | 115 | filename_old = title.strip() 116 | filename = re.sub('[\/:*?"<>|]', '-', filename_old) 117 | # 用来保存内容的文件名,因为文件名不能有一些特殊符号,所以使用正则表达式过滤掉 118 | print filename 119 | save2file(filename, title) 120 | 121 | save2file(filename, "\n\n--------------------Link %s ----------------------\n" %url) 122 | save2file(filename, "\n\n--------------------Detail----------------------\n\n") 123 | # 获取问题的补充内容 124 | content=tree.xpath('//div[@class="zm-editable-content clearfix"]') 125 | for i in content: 126 | #print i 127 | text_content=i.xpath("string(.)") 128 | save2file(filename,text_content) 129 | print "Done" 130 | 131 | 132 | def getCollections(): 133 | #实现,获取所有的collection 的link 134 | links=[] 135 | url='https://www.zhihu.com/collections/mine' 136 | Login() 137 | 138 | if isLogin(): 139 | 
content=session.get(url,headers=headers,allow_redirects=False) 140 | s= content.text 141 | 142 | p=re.compile(r'

\s+') 143 | result=p.findall(s,re.S) 144 | if result is not None: 145 | return result 146 | else: 147 | return None 148 | 149 | def getEachQuestion(url): 150 | s=session.get(url,headers=headers,allow_redirects=False) 151 | tree=etree.HTML(s.text) 152 | result=tree.xpath('//link[@itemprop="url"]/@href') 153 | return result 154 | 155 | 156 | if __name__=='__main__': 157 | sub_folder = os.path.join(os.getcwd(), "collections") 158 | # 专门用于存放下载的电子书的目录 159 | 160 | if not os.path.exists(sub_folder): 161 | os.mkdir(sub_folder) 162 | 163 | os.chdir(sub_folder) 164 | host='https://www.zhihu.com' 165 | collection_link=getCollections() 166 | 167 | for i in collection_link: 168 | print i 169 | 170 | page=1 171 | while 1: 172 | scan_link=collection_url=host+i+'?page=%d' %page 173 | return_content=session.get(scan_link,headers=headers,allow_redirects=False).text 174 | 175 | tree=etree.HTML(return_content) 176 | result=tree.xpath('//link[@itemprop="url"]/@href') 177 | for j in result: 178 | print j 179 | pttrn=re.compile('zhuanlan') 180 | if pttrn.findall(j): 181 | print j 182 | print "skip zhuanlan first" 183 | continue 184 | getAnswer(host+j) 185 | p=re.compile(u'下一页') 186 | if p.search(return_content): 187 | break 188 | p2=re.compile(u'下一页') 189 | if p2.search(return_content) is None: 190 | break 191 | page=page+1 192 | 193 | ''' 194 | collection=['https://www.zhihu.com'+i for i in collection_link] 195 | print collection 196 | ''' 197 | 198 | #url='https://www.zhihu.com/collection/40627095' 199 | #getEachQuestion(url) 200 | #getAnswer('https://www.zhihu.com/question/30348020/answer/144386645') -------------------------------------------------------------------------------- /getContent.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | __author__ = 'Rocky' 3 | from email.mime.text import MIMEText 4 | from email.mime.multipart import MIMEMultipart 5 | import smtplib 6 | from email import Encoders, Utils 7 | import 
urllib2 8 | import time 9 | import re 10 | import sys 11 | import os 12 | 13 | from bs4 import BeautifulSoup 14 | 15 | from email.Header import Header 16 | import next_page 17 | reload(sys) 18 | sys.setdefaultencoding('utf-8') 19 | 20 | class GetContent(): 21 | def __init__(self, id): 22 | 23 | # 给出的第一个参数 就是你要下载的问题的id 24 | # 比如 想要下载的问题链接是 https://www.zhihu.com/question/29372574 25 | # 那么 就输入 python zhihu.py 29372574 26 | 27 | 28 | self.getAnswer(id) 29 | 30 | def save2file(self, filename, content): 31 | # 保存为电子书文件 32 | filename = filename + ".txt" 33 | f = open(filename, 'a') 34 | f.write(content) 35 | f.close() 36 | 37 | def getAnswer(self, answerID): 38 | host = "http://www.zhihu.com" 39 | url = host + '/question/'+answerID 40 | print url 41 | user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" 42 | # 构造header 伪装一下 43 | header = {"User-Agent": user_agent} 44 | req = urllib2.Request(url, headers=header) 45 | 46 | try: 47 | resp = urllib2.urlopen(req,timeout=20) 48 | content= resp.read() 49 | if content is None: 50 | print "Empty" 51 | return False 52 | except: 53 | print "Time out. 
Retry" 54 | time.sleep(30) 55 | # try to switch with proxy ip 56 | resp = urllib2.urlopen(req,timeout=20) 57 | content = resp.read() 58 | if content is None: 59 | print "Empty" 60 | return False 61 | # 这里已经获取了 网页的代码,接下来就是提取你想要的内容。 使用beautifulSoup 来处理,很方便 62 | try: 63 | bs = BeautifulSoup(content) 64 | 65 | except: 66 | print "Beautifulsoup error" 67 | return False 68 | #print content 69 | title = bs.title 70 | # 获取的标题 71 | print title 72 | if title is None: 73 | print "TITL is empty" 74 | return False 75 | if title.string is None: 76 | print "String is empty" 77 | return False 78 | filename_old = title.string.strip() 79 | print filename_old 80 | filename = re.sub('[\/:*?"<>|]', '-', filename_old) 81 | # 用来保存内容的文件名,因为文件名不能有一些特殊符号,所以使用正则表达式过滤掉 82 | 83 | self.save2file(filename, title.string) 84 | 85 | 86 | detail = bs.find("div", class_="zm-editable-content") 87 | self.save2file(filename, "\n\n\n\n--------------------Link %s ----------------------\n\n" %url) 88 | self.save2file(filename, "\n\n\n\n--------------------Detail----------------------\n\n") 89 | # 获取问题的补充内容 90 | 91 | 92 | if detail is not None: 93 | 94 | for i in detail.strings: 95 | self.save2file(filename, unicode(i)) 96 | ''' 97 | else: 98 | return False 99 | 100 | ''' 101 | 102 | ''' 103 | answer = bs.find_all("div", class_="zm-editable-content clearfix") 104 | k = 0 105 | index = 0 106 | for each_answer in answer: 107 | 108 | self.save2file(filename, -------------------------answer %s via -------------------------\n\ % k) 109 | 110 | for a in each_answer.strings: 111 | # 循环获取每一个答案的内容,然后保存到文件中 112 | self.save2file(filename, unicode(a)) 113 | k += 1 114 | index = index + 1 115 | 116 | ''' 117 | #点击更多按钮的bug 118 | #构造header 119 | new_answer=next_page.getAll_Answer(answerID) 120 | k = 0 121 | index = 0 122 | print new_answer 123 | if new_answer is None: 124 | return 0 125 | for each_answer in new_answer : 126 | self.save2file(filename, "\n\n-------------------------answer %d -------------------------\n" % 
k) 127 | sub_answer=re.sub('
|
|

|

','\n', each_answer) 128 | self.save2file(filename, unicode(sub_answer)) 129 | k=k+1 130 | 131 | 132 | smtp_server = 'smtp.126.com' 133 | from_mail = 'your@126.com' 134 | password = 'yourpassword' 135 | to_mail = 'yourname@kindle.cn' 136 | 137 | # send_kindle=MailAtt(smtp_server,from_mail,password,to_mail) 138 | # send_kindle.send_txt(filename) 139 | 140 | # 调用发送邮件函数,把电子书发送到你的kindle用户的邮箱账号,这样你的kindle就可以收到电子书啦 141 | print filename 142 | 143 | 144 | 145 | class MailAtt(): 146 | def __init__(self, smtp_server, from_mail, password, to_mail): 147 | self.server = smtp_server 148 | self.username = from_mail.split("@")[0] 149 | self.from_mail = from_mail 150 | self.password = password 151 | self.to_mail = to_mail 152 | 153 | # 初始化邮箱设置 154 | 155 | def send_txt(self, filename): 156 | # 这里发送附件尤其要注意字符编码,当时调试了挺久的,因为收到的文件总是乱码 157 | self.smtp = smtplib.SMTP() 158 | self.smtp.connect(self.server) 159 | self.smtp.login(self.username, self.password) 160 | self.msg = MIMEMultipart() 161 | self.msg['to'] = self.to_mail 162 | self.msg['from'] = self.from_mail 163 | self.msg['Subject'] = "Convert" 164 | self.filename = filename + ".txt" 165 | self.msg['Date'] = Utils.formatdate(localtime=1) 166 | content = open(self.filename.decode('utf-8'), 'rb').read() 167 | # print content 168 | self.att = MIMEText(content, 'base64', 'utf-8') 169 | self.att['Content-Type'] = 'application/octet-stream' 170 | # self.att["Content-Disposition"] = "attachment;filename=\"%s\"" %(self.filename.encode('gb2312')) 171 | self.att["Content-Disposition"] = "attachment;filename=\"%s\"" % Header(self.filename, 'gb2312') 172 | # print self.att["Content-Disposition"] 173 | self.msg.attach(self.att) 174 | 175 | self.smtp.sendmail(self.msg['from'], self.msg['to'], self.msg.as_string()) 176 | self.smtp.quit() 177 | 178 | #Todo 179 | #add id list to database 180 | 181 | if __name__ == "__main__": 182 | 183 | sub_folder = os.path.join(os.getcwd(), "content") 184 | # 专门用于存放下载的电子书的目录 185 | 186 | if not 
os.path.exists(sub_folder): 187 | os.mkdir(sub_folder) 188 | 189 | os.chdir(sub_folder) 190 | 191 | id="20357585" 192 | #id = sys.argv[1] 193 | # 给出的第一个参数 就是你要下载的问题的id 194 | # 比如 想要下载的问题链接是 https://www.zhihu.com/question/29372574 195 | # 那么 就输入 python zhihu.py 29372574 196 | 197 | 198 | # id_link="/question/"+id 199 | obj = GetContent(id) 200 | # obj.getAnswer(id_link) 201 | 202 | # 调用获取函数 203 | 204 | print "Done" 205 | -------------------------------------------------------------------------------- /login_fetch.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | ''' 3 | 在当前文件夹下创建data.cfg 4 | 第一行是用户名 5 | 第二行是用户密码 6 | 7 | 例如: 8 | zhangsan 9 | 123456 10 | ''' 11 | 12 | __author__ = 'Rocky' 13 | import requests 14 | import cookielib 15 | import re 16 | import json 17 | import time 18 | import os 19 | from getContent import GetContent 20 | import next_page 21 | 22 | agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0' 23 | headers = {'Host': 'www.zhihu.com', 24 | 'Referer': 'https://www.zhihu.com', 25 | 'User-Agent': agent} 26 | 27 | # 全局变量 28 | session = requests.session() 29 | 30 | session.cookies = cookielib.LWPCookieJar(filename="cookies") 31 | 32 | try: 33 | session.cookies.load(ignore_discard=True) 34 | except: 35 | print "Cookie can't load" 36 | 37 | 38 | #读取文件 39 | def getUserData(configFile): 40 | f = open(configFile, 'r') 41 | 42 | #for i in f.readlines(): 43 | data=f.readlines() 44 | username=data[0].strip() 45 | pwd=data[1].strip() 46 | return username,pwd 47 | 48 | 49 | def isLogin(): 50 | url = 'https://www.zhihu.com/settings/profile' 51 | login_code = session.get(url, headers=headers, allow_redirects=False).status_code 52 | print login_code 53 | if login_code == 200: 54 | return True 55 | else: 56 | return False 57 | 58 | 59 | def get_xsrf(): 60 | url = 'https://www.zhihu.com' 61 | r = session.get(url, headers=headers, allow_redirects=False) 62 | txt = r.text 63 | print 
txt 64 | result = re.findall(r'', txt)[0] 65 | return result 66 | 67 | 68 | def getCaptcha(): 69 | #r=1471341285051 70 | r = (time.time() * 1000) 71 | url = 'http://www.zhihu.com/captcha.gif?r=' + str(r) + '&type=login' 72 | 73 | image = session.get(url, headers=headers) 74 | f = open("photo.jpg", 'wb') 75 | f.write(image.content) 76 | f.close() 77 | 78 | 79 | def Login(): 80 | username,pwd=getUserData("data.cfg") 81 | xsrf = get_xsrf() 82 | print xsrf 83 | print len(xsrf) 84 | login_url = 'https://www.zhihu.com/login/email' 85 | data = { 86 | '_xsrf': xsrf, 87 | 'password': pwd, 88 | 'remember_me': 'true', 89 | 'email': username 90 | } 91 | try: 92 | content = session.post(login_url, data=data, headers=headers) 93 | login_code = content.text 94 | d = json.loads(login_code) 95 | print d['msg'] 96 | print content.status_code 97 | #this line important ! if no status, if will fail and execute the except part 98 | #print content.status 99 | 100 | if content.status_code != requests.codes.ok: 101 | print "Need to verification code !" 102 | getCaptcha() 103 | #print "Please input the code of the captcha" 104 | code = raw_input("Please input the code of the captcha") 105 | data['captcha'] = code 106 | content = session.post(login_url, data=data, headers=headers) 107 | print content.status_code 108 | 109 | if content.status_code == requests.codes.ok: 110 | print "Login successful" 111 | session.cookies.save() 112 | #print login_code 113 | else: 114 | session.cookies.save() 115 | return True 116 | except: 117 | print "Error in login" 118 | return False 119 | 120 | 121 | def focus_question(): 122 | focus_id = [] 123 | url = 'https://www.zhihu.com/question/following' 124 | content = session.get(url, headers=headers) 125 | print content 126 | p = re.compile(r'
') 129 | result = re.findall(pattern, content.text)[0] 130 | print result 131 | for i in id_list: 132 | print i 133 | focus_id.append(i) 134 | 135 | url_next = 'https://www.zhihu.com/node/ProfileFollowedQuestionsV2' 136 | page = 20 137 | offset = 20 138 | end_page = 500 139 | xsrf = re.findall(r'(\d+) 个回答

') 52 | 53 | result=pattern.findall(content) 54 | if len(result)==0: 55 | return 0 56 | #print result 57 | new_result=result[0] 58 | offset=(int(new_result)/10) 59 | return offset 60 | 61 | def getAll_Answer(answerID): 62 | #answerID=50737023 63 | offset=getOffset(answerID) 64 | #print offset 65 | content=[] 66 | lists=[] 67 | p='0: 81 | lists.append(id_list[0]) 82 | ''' 83 | for k in id_list: 84 | print k 85 | #type(temp) 86 | #content.extend(id_list) 87 | ''' 88 | print lists 89 | all_answer=[] 90 | each_answer='https://www.zhihu.com/question/%s/answer/' % str(answerID) 91 | for x in lists: 92 | link=each_answer+x 93 | result=getAnswer(link) 94 | #print result 95 | all_answer.append(result) 96 | 97 | return all_answer 98 | 99 | #print id_list 100 | 101 | 102 | #getAll_Answer(123) 103 | ''' 104 | p='
(.*?)
' 105 | link="https://www.zhihu.com/question/50737023/answer/123268369" 106 | req=urllib2.Request(url=link,headers=header) 107 | content=urllib2.urlopen(req).read() 108 | #print content 109 | s=re.findall(p,content,re.S|re.M) 110 | print s[0] 111 | ''' 112 | ''' 113 | print content[0] 114 | author=re.compile(r'
DWill') 117 | ''' 118 | 119 | def test_link(): 120 | answerID=28672128 121 | lnk="https://www.zhihu.com/question/28672128/answer/52384944" 122 | req=urllib2.Request(url=lnk,headers=header) 123 | content=urllib2.urlopen(req,timeout=20).read() 124 | 125 | print content 126 | p='