# coding: utf-8
#
# The comment block below preserves the non-Python parts of the original
# repository dump this file was recovered from.
#
# Repository layout:
#   ├── .gitignore
#   ├── README.md
#   └── tieba.py
#
# /.gitignore:
#   *.py
#   !tieba.py
#   *.json
#   *.pyc
#
# /README.md:
#   Simulates logging in to Baidu Tieba with requests. The code now lives in
#   another project, [tiankui](https://github.com/shfshanyue/tiankui), where
#   it is used as a utility; for the latest version see
#   https://github.com/shfshanyue/tiankui/blob/master/app/util/tieba.py
#
# /tieba.py:
from bs4 import BeautifulSoup
import requests
import urllib
import urllib2
import json
import re
import datetime


class Post(object):

    """Actions that need a logged-in Baidu session, such as replying to a thread.

    Attributes:
        COOKIE_FILE (str): Path of the JSON file used to cache session
            cookies between runs. Override on a subclass or instance to
            store the cache elsewhere.
        base_url (str): Page fetched to seed cookies and to test the login.
        session (requests.Session): Session carrying the Baidu cookies.
    """

    COOKIE_FILE = 'cookie.json'

    def __init__(self, username, password):
        """Restore a cached session, or log in when the cache is unusable.

        Args:
            username (str): Baidu account name.
            password (str): Baidu account password.
        """
        self.base_url = 'http://www.baidu.com'
        self.session = requests.Session()
        try:
            self._get_cookies()
        except (IOError, ValueError) as e:
            # A missing, unreadable or corrupt cache file is not fatal: it
            # only means the password login below has to run.
            print(e)
        if self._check_login():
            print('from cache...')
        else:
            # Cached cookies may have expired; start from a clean jar.
            self.session.cookies.clear()
            self.session.get(self.base_url)
            self.login(username, password)

    def _get_tbs(self):
        """Fetch the ``tbs`` value that is sent along with a reply."""
        url_tbs = 'http://tieba.baidu.com/dc/common/tbs'
        return self.session.get(url_tbs).json()['tbs']

    def _get_token(self):
        """Fetch the login token from the passport API."""
        url_token = 'https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login'
        res = self.session.get(url_token)
        # Single quotes are swapped for double quotes before parsing,
        # presumably because the response is not strict JSON.
        data = json.loads(res.text.replace('\'', '\"'))
        return data['data']['token']

    def _get_cookies(self):
        """Load cookies from the cache file into the session.

        Raises:
            IOError: The cache file cannot be opened.
            ValueError: The cache file does not contain valid JSON.
        """
        with open(self.COOKIE_FILE) as f:
            cookies = json.load(f)
        self.session.cookies.update(cookies)

    def _check_login(self):
        """Report whether the session is logged in.

        The base page is fetched and searched for the text of the
        personal-center link.

        Returns:
            bool: True when the marker text is present in the page.
        """
        res = self.session.get(self.base_url)
        return re.search(u'个人中心', res.text) is not None

    def post(self, content, tid, kw='太原科技大学', fid='266662'):
        """Reply to a Tieba thread.

        Args:
            content (str): Text of the reply.
            tid (str): Thread ID, e.g. ``2674337275`` for
                http://tieba.baidu.com/p/2674337275.
            kw (str, optional): Forum name.
            fid (str, optional): Forum ID.

        Returns:
            The decoded JSON response. Per the original author's note, its
            ``err_code`` field tells whether the reply was accepted.
        """
        url_post = 'http://tieba.baidu.com/f/commit/post/add'
        tbs = self._get_tbs()
        data = {
            'ie': 'utf-8',
            'kw': kw,
            'fid': fid,
            'tid': tid,
            'content': content,
            'is_login': 1,
            'rich_text': '1',
            'tbs': tbs,
            '__type__': 'reply'
        }
        headers = {
            'Host': 'tieba.baidu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'DNT': '1'
        }
        res = self.session.post(url_post, data=data, headers=headers)
        return res.json()

    def login(self, username, password):
        """Log in to Baidu and cache the session cookies on success.

        When the login is confirmed, the cookies are written to
        ``COOKIE_FILE`` so that later runs can skip the password login.

        Args:
            username (str): Baidu account name.
            password (str): Baidu account password.
        """
        url_login = 'https://passport.baidu.com/v2/api/?login'
        data = {
            'username': username,
            'password': password,
            'u': 'https://passport.baidu.com/',
            'tpl': 'pp',
            'token': self._get_token(),
            'staticpage': 'https://passport.baidu.com/static/passpc-account/html/v3Jump.html',
            'isPhone': 'false',
            'charset': 'UTF-8',
            'callback': 'parent.bd__pcbs__ra48vi'
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Referer': 'https://passport.baidu.com/v2/?login',
        }
        # The response body is not inspected; success is judged by
        # _check_login() using the cookies the request left in the session.
        self.session.post(url_login, data=data, headers=headers)
        if self._check_login():
            with open(self.COOKIE_FILE, 'w') as f:
                json.dump(self.session.cookies.get_dict(), f)
            print('login...')
        else:
            print('password or username error!')


class TiebaPost(object):

    """Scrape the replies of a single Tieba thread.

    Attributes:
        base (str): Thread URL, e.g. http://tieba.baidu.com/p/2674337275
    """

    def __init__(self, base):
        self.base = base

    def find_page(self, n):
        """Yield the replies found on page ``n`` of the thread.

        Args:
            n (int): Page number.

        Yields:
            dict: The ``author`` and ``content`` objects of the reply's
            ``data-field`` JSON merged into one dict, with ``date`` parsed
            into a ``datetime`` and ``content`` set to the text of the
            reply body.
        """
        url = '%s?pn=%d' % (self.base, n)
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')
        for post in soup.find_all(class_='l_post'):
            field = json.loads(post['data-field'])
            info = dict(field['author'], **field['content'])
            info['date'] = datetime.datetime.strptime(
                info['date'], '%Y-%m-%d %H:%M')
            # NOTE(review): the original ran re.sub(r'', '', text) here under
            # a comment about stripping Baidu-hosted images. The pattern was
            # empty, so the substitution had no effect; it is dropped until
            # the intended pattern is recovered.
            info['content'] = post.find(class_='j_d_post_content').text
            yield info
    findPage = find_page

    @property
    def max_page(self):
        """Number of pages of the thread, read from the page-jump input."""
        soup = BeautifulSoup(requests.get(self.base).text, 'html.parser')
        page = soup.find('input', id='jumpPage4')['max-page']
        return int(page)


class Tieba(object):

    """Scrape the thread list of one Tieba forum.

    Attributes:
        kw (str): Forum name used as the ``kw`` query parameter.
        pid (int): Threads whose ``id`` is below this value are skipped.
        base_url (str): Baidu home page URL (not used by this class).
    """

    def __init__(self, pid=0, kw='太原科技大学'):
        self.kw = kw
        self.pid = pid
        self.base_url = 'http://www.baidu.com'

    def __getitem__(self, key):
        """Return the thread generator for page ``key`` (``tieba[n]``)."""
        # find_page takes exactly one argument; the stray extra ``None``
        # passed here before made every subscript raise TypeError.
        return self.find_page(key)

    def find_page(self, page):
        """Yield the threads listed on the given page of the forum.

        Args:
            page (int): Zero-based page index; it is multiplied by 50 to
                form the ``pn`` offset (presumably 50 threads per page).

        Yields:
            dict: The thread's ``data-field`` JSON with a ``title`` entry
            added from the thread's title link.
        """
        # Let requests build the query string so a forum name containing
        # non-ASCII text, spaces or '&' is escaped correctly.
        params = [('kw', self.kw), ('ie', 'utf-8'), ('pn', page * 50)]
        response = requests.get('http://tieba.baidu.com/f', params=params)
        soup = BeautifulSoup(response.text, 'html.parser')
        for j_thread in soup.find_all(class_='j_thread_list'):
            data = json.loads(j_thread['data-field'])
            if data['id'] < self.pid:
                continue
            data['title'] = j_thread.find('a', class_='j_th_tit').get_text()
            yield data
    findPage = find_page