├── .gitignore
├── README.md
└── tieba.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.py
2 | !tieba.py
3 | *.json
4 | *.pyc


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 使用 requests 模拟登录百度贴吧。本脚本现已并入另一个项目 [tiankui](https://github.com/shfshanyue/tiankui)，作为其中的一个工具模块使用，最新版本见以下链接：
2 | https://github.com/shfshanyue/tiankui/blob/master/app/util/tieba.py
3 | 


--------------------------------------------------------------------------------
/tieba.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | from bs4 import BeautifulSoup
  3 | import requests
  4 | import urllib
  5 | import urllib2
  6 | import json
  7 | import re
  8 | import datetime
  9 | 
 10 | 
 11 | class Post(object):
 12 | 
 13 |     """与需要登陆相关的类，如发帖，签到，
 14 |     """
 15 | 
 16 |     def __init__(self, username, password):
 17 |         self.base_url = 'http://www.baidu.com'
 18 |         self.session = requests.Session()
 19 |         try:
 20 |             self._get_cookies()
 21 |         except IOError as e:
 22 |             print e
 23 |         if self._check_login():
 24 |             print 'from cache...'
 25 |         else:
 26 |             # 防止cookie过期失效
 27 |             self.session.cookies.clear()
 28 |             self.session.get(self.base_url)
 29 |             self.login(username, password)
 30 | 
 31 |     def _get_tbs(self):
 32 |         url_tbs = 'http://tieba.baidu.com/dc/common/tbs'
 33 |         return self.session.get(url_tbs).json()['tbs']
 34 | 
 35 |     def _get_token(self):
 36 |         url_token = 'https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login'
 37 |         res = self.session.get(url_token)
 38 |         data = json.loads(res.text.replace('\'', '\"'))
 39 |         token = data['data']['token']
 40 |         return token
 41 | 
 42 |     def _get_cookies(self):
 43 |         """从文本中获得cookie
 44 |         """
 45 |         with open('cookie.json') as f:
 46 |             cookies = json.load(f)
 47 |             self.session.cookies.update(cookies)
 48 | 
 49 |     def _check_login(self):
 50 |         """验证是否登陆成功
 51 | 
 52 |         Returns:
 53 |             Boolean: 是否登陆成功
 54 |         """
 55 |         res = self.session.get(self.base_url)
 56 |         match = re.search(u'个人中心', res.text)
 57 |         if match:
 58 |             return True
 59 |         return False
 60 | 
 61 |     def post(self, content, tid, kw='太原科技大学', fid='266662'):
 62 |         """百度贴吧回复帖子
 63 | 
 64 |         Args:
 65 |             content (str): 回复帖子的内容 
 66 |             tid (str): 回复帖子的ID，http://tieba.baidu.com/p/2674337275，即2674337275
 67 |             kw (str, optional): 吧名，即太原科技大学
 68 |             fid (str, optional): 吧ID
 69 | 
 70 |         Returns:
 71 |             TYPE: 百度贴吧的相应json，err_code可查看是否发送成功
 72 |         """
 73 |         url_post = 'http://tieba.baidu.com/f/commit/post/add'
 74 |         tbs = self._get_tbs()
 75 |         data = {
 76 |             'ie': 'utf-8',
 77 |             'kw': kw,
 78 |             'fid': fid,
 79 |             'tid': tid,
 80 |             'content': content,
 81 |             'is_login': 1,
 82 |             'rich_text': '1',
 83 |             'tbs': tbs,
 84 |             '__type__': 'reply'
 85 |         }
 86 |         headers = {
 87 |             'Host': 'tieba.baidu.com',
 88 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
 89 |             'Accept': 'application/json, text/javascript, */*; q=0.01',
 90 |             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
 91 |             'Accept-Encoding': 'gzip, deflate',
 92 |             'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
 93 |             'X-Requested-With': 'XMLHttpRequest',
 94 |             'Pragma': 'no-cache',
 95 |             'Cache-Control': 'no-cache',
 96 |             'DNT': '1'
 97 |         }
 98 |         res = self.session.post(url_post, data=data, headers=headers)
 99 |         return res.json()
100 | 
101 |     def login(self, username, password):
102 |         """登陆百度贴吧，如果登陆成功，保存cookie到json文本，下次登陆可以直接从文本中cookie登陆，无需账号密码
103 | 
104 |         Args:
105 |             username (str): 百度账号
106 |             password (str): 百度账号密码
107 |         """
108 |         url_login = 'https://passport.baidu.com/v2/api/?login'
109 |         data = {
110 |             'username': username,
111 |             'password': password,
112 |             'u': 'https://passport.baidu.com/',
113 |             'tpl': 'pp',
114 |             'token': self._get_token(),
115 |             'staticpage': 'https://passport.baidu.com/static/passpc-account/html/v3Jump.html',
116 |             'isPhone': 'false',
117 |             'charset': 'UTF-8',
118 |             'callback': 'parent.bd__pcbs__ra48vi'
119 |         }
120 | 
121 |         headers = {
122 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
123 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
124 |             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
125 |             'Accept-Encoding': 'gzip, deflate',
126 |             'DNT': '1',
127 |             'Connection': 'keep-alive',
128 |             'Referer': 'https://passport.baidu.com/v2/?login',
129 |         }
130 |         res = self.session.post(
131 |             url_login, data=data, headers=headers)
132 |         if self._check_login():
133 |             with open('cookie.json', 'w') as f:
134 |                 json.dump(self.session.cookies.get_dict(), f)
135 |             print 'login...'
136 |         else:
137 |             print 'password or username error!'
138 | 
139 | 
class TiebaPost(object):

    """Scrape every reply of a single Tieba thread.

    Attributes:
        base (str): thread URL, e.g. http://tieba.baidu.com/p/2674337275
    """

    def __init__(self, base):
        self.base = base

    def find_page(self, n):
        """Yield the replies found on page ``n`` of the thread.

        Args:
            n (int): page number, sent as the ``pn`` query parameter.

        Yields:
            dict: the ``author`` and ``content`` objects of a reply's
            ``data-field`` JSON merged into one dict, with ``date`` parsed
            into a ``datetime`` and ``content`` replaced by the reply text.
        """
        page_url = '%s?pn=%d' % (self.base, n)
        markup = requests.get(page_url).text
        document = BeautifulSoup(markup, 'html.parser')
        for floor in document.find_all(class_='l_post'):
            field = json.loads(floor['data-field'])
            # Keys of ``content`` override same-named keys of ``author``.
            reply = dict(field['author'], **field['content'])
            reply['date'] = datetime.datetime.strptime(
                reply['date'], '%Y-%m-%d %H:%M')
            body_text = floor.find(class_='j_d_post_content').text
            # Drop Baidu-hosted image tags (they cannot be viewed elsewhere).
            # NOTE(review): ``.text`` presumably already omits markup, so this
            # pattern would only hit literal tag text -- confirm the intent.
            reply['content'] = re.sub(
                r'<img class="BDE_Image".*?>', '', body_text)
            yield reply
    findPage = find_page

    @property
    def max_page(self):
        """Number of pages of the thread, read from the ``max-page`` attribute
        of the ``jumpPage4`` input on the thread's first page.
        """
        markup = requests.get(self.base).text
        document = BeautifulSoup(markup, 'html.parser')
        jump_box = document.find('input', id='jumpPage4')
        return int(jump_box['max-page'])
180 | 
181 | 
class Tieba(object):

    """Crawl the thread list of one Tieba forum.

    ``tieba[n]`` is shorthand for ``tieba.find_page(n)``.

    Attributes:
        kw (str): forum name, sent as the ``kw`` query parameter.
        pid (int): threads whose ``id`` is lower than this value are skipped.
        base_url (str): Baidu home page URL (not read by the methods below).
    """

    def __init__(self, pid=0, kw='太原科技大学'):
        self.kw = kw
        self.pid = pid
        self.base_url = 'http://www.baidu.com'

    def __getitem__(self, key):
        """Return the thread generator of list page ``key``."""
        # find_page accepts only the page index, so forward it alone.
        return self.find_page(key)

    def find_page(self, page):
        """Yield the threads listed on page ``page`` of the forum.

        Args:
            page (int): zero-based page index; it is multiplied by 50 to form
                the ``pn`` query parameter.

        Yields:
            dict: the thread's ``data-field`` JSON with an added ``title`` key
            holding the text of its ``j_th_tit`` link.
        """
        offset = page * 50
        url = 'http://tieba.baidu.com/f?kw={0:s}&ie=utf-8&pn={1}'.format(
            self.kw, offset)
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for j_thread in soup.find_all(class_='j_thread_list'):
            data = json.loads(j_thread['data-field'])
            if data['id'] < self.pid:
                continue
            data['title'] = j_thread.find('a', class_='j_th_tit').get_text()
            yield data
    findPage = find_page
205 | 


--------------------------------------------------------------------------------