#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# Repository: clean-baidutieba
#
# Layout of the original repository:
#   .gitignore         ignores `*.json` and `*.log`
#   README.md          summarised below
#   clean_tieba.py     this module
#   requirements.txt   requests, beautifulsoup4, lxml
#
# README.md
# ---------
# clean-baidutieba: delete your own threads and replies on Baidu Tieba.
#
# Requirements:
#   sudo pip3 install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/
# or
#   sudo pip3 install requests beautifulsoup4 lxml -i https://pypi.mirrors.ustc.edu.cn/simple/
#
# Usage:
#   One optional argument:
#     -m / --match  regular expression matched against the content of your own
#                   replies; a reply is deleted when it matches and skipped
#                   otherwise. Default: `.*`
#
#   There are three inputs in total:
#   1. At start-up, the cookies. Format per the README: `Cookie: xxx=xxx; ... `
#      (open tieba.baidu.com in Chrome, "view source" on the request headers
#      and copy the cookie line).
#      NOTE(review): login() below tells the user NOT to include the
#      `Cookie:` prefix any more -- README and code disagree here.
#   2. Before fetching threads and before fetching replies, a JSON file name
#      (relative to the working directory) so that a later run can reload a
#      previously saved list. On the first run just press Enter.
#   3. After the first crawl there are two `*fail.json` files; the program
#      reloads them periodically (README says every 12h) and retries; both
#      files are rewritten after every pass.
#      NOTE(review): the main loop below sleeps 4 hours, not 12.
#
# TODO (from the README):
#   Error code 230308 is not understood. It reportedly means the `tbs` token
#   is wrong, but the token used here is scraped from the page source.
import json
import os
import re
import sys
import time
from datetime import datetime
from optparse import OptionParser

import lxml  # noqa: F401 -- not referenced by name; BeautifulSoup is asked for the 'lxml' parser
import requests
from bs4 import BeautifulSoup
from requests.utils import cookiejar_from_dict

LOG_FILE = 'clean_tieba.log'
BASE_URL = 'https://tieba.baidu.com'
DELETE_URL = 'https://tieba.baidu.com/f/commit/post/delete'

# Patterns used to scrape the delete-form fields out of a thread page and the
# post id out of a post URL. Raw strings: the originals relied on Python
# passing unknown escapes such as `\d` through unchanged.
_CHARSET_RE = re.compile(r'\"?charset\"?\s*:\s*[\'\"]?(.*?)[\'\"]')
_TBS_RE = re.compile(r'\"?tbs\"?\s*:\s*[\'\"]?([\w\d]+)[\'\"]')
_KW_RE = re.compile(r'name="kw" value="(.*?)"')
_FID_RE = re.compile(r"fid:'(\d+)'")
_TID_RE = re.compile(r"tid:'(\d+)'")
_CID_RE = re.compile(r'cid=(\d+)#')
_PID_RE = re.compile(r'pid=(\d+)')
_THREAD_ID_RE = re.compile(r'p/(\d+)\?')

# `data.type` values returned by /errorpage/deledErrorInfo mapped to the
# message logged for each of them.
_DELETED_REASONS = {
    0: '很抱歉,该贴已被删除',
    1: '小广告太多啦。商品交易贴,度娘建议每天不能超过5条哦',
    2: '亲,由于您使用机器刷贴,影响了吧友在贴吧的浏览体验,导致贴子被删',
    3: '亲,由于您的贴子内含有敏感词汇/图片,影响了吧友在贴吧的浏览体验,导致贴子被删',
    4: '很抱歉,您的贴子已被系统删除',
    5: '很抱歉,您的贴子已被自己删除',
    6: '很抱歉,您的贴子已被吧务删除',
}

# `err_code` values of the delete endpoint that are known failures, mapped to
# (log message, value returned by Tieba.error_check).
_KNOWN_ERRORS = {
    220034: ('Failed: 您的操作太频繁了', 'exit'),
    260005: ('Failed: Cookies失效', False),
    230308: ('Failed: 据说tbs不对', False),
}


def log(text):
    """Append a timestamped line to the log file and echo it to stdout.

    The same formatted line (one timestamp) goes to both destinations.
    """
    line = '[%s] %s' % (str(datetime.now()), text)
    with open(LOG_FILE, 'a', encoding='utf-8') as fh:
        fh.write(line + '\n')
    print(line)


def _read_json(path):
    """Load and return the JSON document stored at *path*."""
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


def _write_json(path, items):
    """Write *items* to *path* as indented, non-ASCII-escaped JSON."""
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(json.dumps(items, ensure_ascii=False, indent=4))


class Tieba:
    """Scrape the logged-in user's threads and replies and delete them."""

    user_id = -1
    username = ''
    # Regular expression a reply's content must match to be deleted.
    match = '.*'
    r = requests.Session()
    # Default request headers; never mutate this dict in place, it is shared
    # at class level.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://tieba.baidu.com/',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Origin': 'http://tieba.baidu.com',
    }

    def error_check(self, text):
        """Interpret the JSON body returned by the delete endpoint.

        Returns True when ``err_code`` is 0, the string ``'exit'`` when the
        server reports rate limiting (220034), and False for every other
        outcome, including a body that is not JSON or carries no
        ``err_code``.
        """
        log(text)
        try:
            payload = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return False
        # A body without `err_code` (or one that is not an object) is treated
        # like any other unknown error instead of raising.
        code = payload.get('err_code') if isinstance(payload, dict) else None
        if code == 0:
            log('Success')
            return True
        message, result = _KNOWN_ERRORS.get(code, ('Failed: 不造啥错误', False))
        log(message)
        return result

    def get_tie(self):
        """Collect every thread listed on the user's ``my_tie`` pages.

        Pages are fetched from 1 upwards until one contains no ``<li>``.
        Returns a list of dicts with keys ``bar_name``, ``bar_url``,
        ``tie_name`` and ``tie_url``. Raises IndexError when a page has no
        ``.simple_block_container`` element.
        """
        tie_list = []
        page = 1
        while True:
            url = 'https://tieba.baidu.com/i/%s/my_tie?&pn=%d' % (self.user_id, page)
            log('-->%s' % url)
            resp = self.r.get(url, headers=self.headers)
            container = BeautifulSoup(resp.text, 'lxml').select('.simple_block_container')[0].ul
            entries = container.select('li')
            if not entries:
                break
            for entry in entries:
                links = entry.select('a')
                # First anchor is the forum ("bar"), second is the thread.
                new_tie = {
                    'bar_name': links[0].text,
                    'bar_url': BASE_URL + links[0]['href'],
                    'tie_name': links[1].text,
                    'tie_url': BASE_URL + links[1]['href'],
                }
                log('add new tie: [%s][%s]' % (new_tie['bar_name'], new_tie['tie_name']))
                tie_list.append(new_tie)
            page += 1
        return tie_list

    def get_reply(self):
        """Collect the user's replies whose content matches ``self.match``.

        Pages of ``my_reply`` are fetched from 1 upwards until one contains
        no ``.t_forward`` element. Returns a list of dicts with keys
        ``reply_content``, ``reply_url``, ``tie_name``, ``tie_url``,
        ``bar_name`` and ``bar_url``.
        """
        reply_list = []
        page = 1
        while True:
            url = 'https://tieba.baidu.com/i/%s/my_reply?&pn=%d' % (self.user_id, page)
            log('-->%s' % (url))
            resp = self.r.get(url, headers=self.headers)
            blocks = BeautifulSoup(resp.text, 'lxml').select('.t_forward')
            if not blocks:
                break
            for block in blocks:
                try:
                    context = block.select('.for_reply_context')[0]
                    reply_content = context.text
                    reply_url = BASE_URL + context['href']
                except IndexError:
                    # No `.for_reply_context` element: fall back to the
                    # `.b_reply` link and treat the content as empty.
                    reply_content = ''
                    reply_url = BASE_URL + block.select('.b_reply')[0]['href']
                title = block.select('.thread_title')[0]
                bar = block.select('.common_source_main')[0].select('a')[-1]
                new_reply = {
                    'reply_content': reply_content,
                    'reply_url': reply_url,
                    'tie_name': title.text,
                    'tie_url': BASE_URL + title['href'],
                    'bar_name': bar.text,
                    'bar_url': BASE_URL + bar['href'],
                }
                if re.match(self.match, reply_content):
                    log('add new reply: [%s][%s]' % (reply_content, new_reply['tie_name']))
                    reply_list.append(new_reply)
                else:
                    log('NOT match, pass: [%s][%s]' % (reply_content, new_reply['tie_name']))
            page += 1
        return reply_list

    def _page_unavailable(self, html):
        """Handle pages that cannot be acted on.

        Returns True (counted as done) for a merged forum, False for a
        hidden thread, and None when the page looks usable.
        """
        if '该吧被合并您所访问的贴子无法显示' in html:
            log('该吧被合并您所访问的贴子无法显示')
            log('Success')
            return True
        if '您访问的贴子被隐藏' in html:
            log('抱歉,您访问的贴子被隐藏,暂时无法访问')
            log('Failed')
            return False
        return None

    def _deleted_reason(self, tie_url):
        """Ask the server why a thread was deleted.

        Returns the integer ``data.type`` of the reply, or None when the
        thread id or the response cannot be parsed.
        """
        try:
            tid = _THREAD_ID_RE.findall(tie_url)[0]
            url = 'https://tieba.baidu.com/errorpage/deledErrorInfo?tid=%s' % tid
            info = json.loads(self.r.get(url, headers=self.headers).text)
            return int(info['data']['type'])
        except (IndexError, KeyError, TypeError, ValueError):
            return None

    def _build_delete_form(self, html, post_url):
        """Build the form posted to the delete endpoint.

        Field values are scraped from the page *html*; the post id comes
        from the ``cid`` parameter of *post_url*, falling back to ``pid``
        when ``cid`` is ``0``. Returns None (after logging) when any field
        cannot be found, so that one odd page does not abort the whole run.
        """
        try:
            pid = _CID_RE.findall(post_url)[0]
            if pid == '0':
                pid = _PID_RE.findall(post_url)[0]
            return {
                'ie': _CHARSET_RE.findall(html)[0].lower(),
                'tbs': _TBS_RE.findall(html)[0],
                'kw': _KW_RE.findall(html)[0],
                'fid': _FID_RE.findall(html)[0],
                'tid': _TID_RE.findall(html)[0],
                'username': self.username,
                'delete_my_post': 1,
                'delete_my_thread': 0,
                'is_vipdel': 0,
                'pid': pid,
                'is_finf': 'false',
            }
        except IndexError:
            log('Failed: cannot extract delete form fields from %s' % post_url)
            return None

    def _submit_delete(self, data, headers):
        """POST *data* to the delete endpoint and return ``error_check``'s verdict."""
        resp = self.r.post(DELETE_URL, data=data, headers=headers)
        log(resp.status_code)
        return self.error_check(resp.text)

    def del_tie(self, reply):
        """Delete one thread entry produced by ``get_tie``.

        *reply* is a dict with at least ``tie_url``. Returns True when the
        thread is gone (deleted now, already deleted, or its forum was
        merged), ``'exit'`` when the server reports rate limiting, and False
        otherwise.
        """
        log(json.dumps(reply, ensure_ascii=False, indent=4))
        log('-->%s' % reply['tie_url'])
        html = self.r.get(reply['tie_url'], headers=self.headers).text

        if '该贴已被删除' in html:
            # Already deleted: log the reason; nothing is left to do.
            message = _DELETED_REASONS.get(self._deleted_reason(reply['tie_url']))
            if message is None:
                log('Failed')
                return False
            log(message)
            log('Success')
            return True

        verdict = self._page_unavailable(html)
        if verdict is not None:
            return verdict

        data = self._build_delete_form(html, reply['tie_url'])
        if data is None:
            return False

        log('-->%s' % DELETE_URL)
        log('delete reply')
        # Work on a copy: `self.headers` is the class-level dict shared by
        # every request, so the per-thread Referer must not be written into it.
        headers = dict(self.headers)
        headers['Referer'] = reply['tie_url']
        log(data)
        return self._submit_delete(data, headers)

    def del_reply(self, reply):
        """Delete one reply entry produced by ``get_reply``.

        *reply* is a dict with at least ``reply_url``. Return values are the
        same as for ``del_tie``.
        """
        log(json.dumps(reply, ensure_ascii=False, indent=4))
        log('-->%s' % reply['reply_url'])
        html = self.r.get(reply['reply_url'], headers=self.headers).text

        verdict = self._page_unavailable(html)
        if verdict is not None:
            return verdict

        data = self._build_delete_form(html, reply['reply_url'])
        if data is None:
            return False

        log('-->%s' % DELETE_URL)
        log('delete reply')
        return self._submit_delete(data, self.headers)

    def login(self):
        """Install user-supplied cookies and look up the user name and id.

        Reads the cookie string from stdin, then scrapes ``user_name`` from
        the home page and ``user_id`` from the profile page. Raises
        IndexError when either value is absent from the page.
        """
        print('这次不用输入前缀`Cookie:`了,直接复制后面的key-value对')
        cookie = input('give me cookies[xxx=xxx; xxx=xxx]:')
        pairs = re.findall(r'([^=]*)=([^;]*);{0,1}\s{0,1}', cookie)
        self.r.cookies = cookiejar_from_dict({k: v for k, v in pairs})

        url = 'https://tieba.baidu.com'
        log('-->%s' % url)
        resp = self.r.get(url)
        self.username = re.findall(r'"user_name": "(.*?)",', resp.text)[0]
        log('get username: %s' % (self.username))

        url = 'https://tieba.baidu.com/home/profile?un=%s' % self.username
        log('-->%s' % url)
        resp = self.r.get(url)
        self.user_id = re.findall(r'user_id":(\d+)', resp.text)[0]

    def _load_items(self, input_file, prompt, fail_path, load_message, fetch, dump_path):
        """Return the work list for one round.

        With *input_file* true the user is prompted for a JSON file; without
        it the previous round's *fail_path* is used when it exists. In every
        other case the list is scraped with *fetch* and saved to *dump_path*.
        """
        if input_file:
            path = input(prompt)
            if path != '':
                return _read_json(path)
        elif os.path.exists(fail_path):
            log(load_message)
            return _read_json(fail_path)
        items = fetch()
        _write_json(dump_path, items)
        return items

    def _delete_all(self, items, delete_one, label, fail_path):
        """Run *delete_one* over *items* and save the ones that failed.

        Once *delete_one* returns ``'exit'`` (rate limit reached), the
        remaining items are recorded as failed without being attempted.
        """
        failed = []
        limit_reached = False
        total = len(items)
        for position, item in enumerate(items, 1):
            log('%s: %d/%d' % (label, position, total))
            if limit_reached:
                failed.append(item)
                continue
            status = delete_one(item)
            if status == 'exit':
                print('达到每日上限,等待下一轮')
                limit_reached = True
                failed.append(item)
            elif status is False:
                failed.append(item)
        _write_json(fail_path, failed)

    def start(self, input_file=True):
        """Run one deletion round over threads, then replies.

        With *input_file* true (first round) the user may supply saved JSON
        lists; otherwise the ``*_fail.json`` files of the previous round are
        retried. Exits the process when there is nothing left to delete.
        """
        tie_list = self._load_items(
            input_file,
            'Do you have tie json, give me file, if not, just enter:',
            'clean_tieba_tie_fail.json',
            'load tie failed from file',
            self.get_tie,
            'clean_tieba_tie_list.json',
        )
        # NOTE(review): the interactive branch has always dumped replies to
        # 'clean_tieba_tie_reply.json' while the retry branch uses
        # 'clean_tieba_reply_list.json'. Kept as-is; probably a typo upstream.
        reply_list = self._load_items(
            input_file,
            'Do you have reply json, give me file, if not, just enter:',
            'clean_tieba_reply_fail.json',
            'load reply failed from file',
            self.get_reply,
            'clean_tieba_tie_reply.json' if input_file else 'clean_tieba_reply_list.json',
        )

        if not tie_list and not reply_list:
            log('done')
            sys.exit()

        self._delete_all(tie_list, self.del_tie, 'tie', 'clean_tieba_tie_fail.json')
        self._delete_all(reply_list, self.del_reply, 'reply', 'clean_tieba_reply_fail.json')


if __name__ == '__main__':
    tieba = Tieba()
    parser = OptionParser()
    parser.add_option('-m', '--match',
                      help="give me re format, if match in reply, I will delete")
    (options, args) = parser.parse_args()
    if options.match is not None:
        # Was `tieba.match = match`, a NameError whenever -m was supplied.
        tieba.match = options.match
    log('match had set: (%s)' % tieba.match)

    tieba.login()
    tieba.start()

    while True:
        sleep_hours = 4
        log('will sleep %d hours' % (sleep_hours))
        for i in range(0, sleep_hours, 1):
            log('start after %d hours' % (sleep_hours - i))
            time.sleep(60 * 60)
        tieba.start(False)