├── .gitignore
├── v0
│   ├── post.py
│   ├── config.py
│   └── run.py
├── README.md
└── v1
    ├── app.py
    └── client.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | *.html
3 | *.json
4 | *.log
5 | **/Profiles
6 | **/temp
7 | **/data
8 | **/download
--------------------------------------------------------------------------------
/v0/post.py:
--------------------------------------------------------------------------------
1 | class Post:
2 |     def __init__(self, pid, plikenum, pbadge, pcontent, ptime, pquote=None, pimage=False):
3 |         self.id = pid
4 |         self.likenum = plikenum
5 |         self.badge = pbadge
6 |         self.content = pcontent
7 |         self.time = ptime
8 |         self.quote = pquote
9 |         self.image = pimage
10 |         self.replies = []
11 |
12 |     def add_reply(self, rid, name, rcontent, rtime, rquote):
13 |         self.replies.append(Reply(rid, name, rcontent, rtime, rquote))
14 |
15 |
16 | class Reply:
17 |     def __init__(self, rid, name, rcontent, rtime, rquote):
18 |         self.id = rid
19 |         self.name = name
20 |         self.content = rcontent
21 |         self.time = rtime
22 |         self.quote = rquote
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PKUHoleCrawler
2 | # v1
3 | With a better understanding of the Tree Hole API, the earlier Selenium-based approach has been dropped in favor of calling the API directly with requests. This brings fewer dependencies, more stable runs, higher efficiency, and support for fetching posts by specified IDs.
4 |
5 | ## Usage
6 | `client.py` provides low-level methods for talking to the Tree Hole API, while `app.py` provides integrated methods for fetching data in bulk.
7 |
8 | The main functionality lives in `App.get_posts()`, which accepts a list of post IDs.
9 |
10 | Logging in may require ~~a mobile OTP token or~~ SMS verification; just follow the prompts.
11 |
12 | `config.json`:
13 | ```json
14 | {
15 |     "username": "your_username",
16 |     "password": "your_password",
17 |     "secret_key": "your_secret_key"
18 | }
19 | ```
20 | `secret_key` is used to generate the mobile OTP token automatically. It can be obtained by capturing the network traffic while binding the mobile token.
21 |
22 | ## Notes
23 | Since the API is now called directly from multiple threads, keep the number of posts per crawl small, otherwise the account risks being banned.
24 | # v0
25 | A (simple) PKU Tree Hole crawler that scrapes the dynamically rendered pages with Selenium.
26 |
27 | An improvement on [luciusssss/PKUHoleCrawler: 北大树洞爬虫 (github.com)](https://github.com/luciusssss/PKUHoleCrawler), adapted to the new version of the Tree Hole at [北大树洞 (pku.edu.cn)](https://treehole.pku.edu.cn). Edge and Firefox are currently supported.
28 |
29 | ## Setup
30 |
31 | Install Selenium:
32 |
33 | ```
34 | pip3 install selenium
35 | ```
36 |
37 | For automatic login, the crawler needs a copy of the browser's user data. Edge's user data lives by default at `C:\Users\YourName\AppData\Local\Microsoft\Edge\User Data` (Windows) or `/home/YourName/.config/microsoft-edge/User Data` (Linux); Firefox's lives in a randomly named folder (e.g. `32fy5laa.default-release`) under `C:\Users\YourName\AppData\Roaming\Mozilla\Firefox\Profiles\` (Windows) or `/home/YourName/.mozilla/firefox/` (Linux). The original browser must stay logged in (i.e. visiting https://treehole.pku.edu.cn does not redirect to the login page).
38 |
39 | Install the webdriver matching your browser as needed.
40 |
41 | ## Usage
42 |
43 | Set the run parameters with `config.py`:
44 |
45 | ```
46 | config.py [-h] [--crawl_size CRAWL_SIZE] [--part PART] [--browser BROWSER] [--profiles_path PROFILES_PATH]
47 | ```
48 |
49 | Then run `run.py`:
50 |
51 | ```
52 | python3 run.py
53 | ```
54 |
55 | The crawled text is saved in batches of `part` posts to `tree_hole_{part}_{start}-{end}({utc-time}).json`;
56 |
57 | images are saved to the `download` folder as `image_{pid}.png`.
58 |
59 | Command-line interface:
60 |
61 | ```
62 | Log in successfully
63 | 第1部分: |██████████| 500/500 100.0% 总进度: |██████████| 2000/2000 100.0%
64 | Crawling done
65 | Press any key to finish...
66 | ```
67 |
68 |
69 |
70 | ## Notes
71 | Because the original architecture relies on Selenium and a webdriver, it is less stable and less efficient, so the crawler has been rewritten (see v1). The v0 code is no longer maintained.
72 |
73 | If you want visual control over the browsing process, you can still build on it.
74 |
75 |
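As a concrete illustration of the v1 workflow described in the README's Usage section, here is a minimal sketch of driving `App` from a script in the `v1` directory. It assumes a filled-in `config.json` next to `app.py`; the post IDs are placeholders.

```python
# Minimal sketch of the v1 flow; the post IDs below are placeholders.
from app import App

app = App()  # reads config.json and logs in, prompting for SMS/OTP verification if needed

# Print a single post to the console
app.read("4000000")

# Fetch several posts (text, images and comments) concurrently and write them
# to v1/data/{max_id}-{min_id}.json
app.get_posts(["4000000", "4000001", "4000002"])
```

`App.get_posts()` fans the requests out over a 20-worker thread pool, which is why the README warns against crawling too many posts in one go.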
--------------------------------------------------------------------------------
/v0/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import argparse
4 |
5 | class WebConfig:
6 |     _default_config = {
7 |         "browser": "chrome",
8 |         "profiles_path": "/path/to/profiles",
9 |         "crawl_size": 1000,
10 |         "part": 200,
11 |         "mode": "Normal"
12 |     }
13 |
14 |     _config_file = "config.json"
15 |
16 |     def __init__(self):
17 |         self.load_config()
18 |
19 |     def load_config(self):
20 |         if os.path.exists(self._config_file):
21 |             with open(self._config_file, 'r') as f:
22 |                 self._config = json.load(f)
23 |         else:
24 |             self._config = self._default_config
25 |             self.save_config()
26 |
27 |     def save_config(self):
28 |         with open(self._config_file, 'w') as f:
29 |             json.dump(self._config, f, indent=4)
30 |
31 |     @property
32 |     def browser(self):
33 |         return self._config.get("browser", self._default_config["browser"])
34 |
35 |     @browser.setter
36 |     def browser(self, value):
37 |         self._config["browser"] = value
38 |         self.save_config()
39 |
40 |     @property
41 |     def profiles_path(self):
42 |         return self._config.get("profiles_path", self._default_config["profiles_path"])
43 |
44 |     @profiles_path.setter
45 |     def profiles_path(self, value):
46 |         self._config["profiles_path"] = value
47 |         self.save_config()
48 |
49 |     @property
50 |     def crawl_size(self):
51 |         return self._config.get("crawl_size", self._default_config["crawl_size"])
52 |
53 |     @crawl_size.setter
54 |     def crawl_size(self, value):
55 |         self._config["crawl_size"] = value
56 |         self.save_config()
57 |
58 |     @property
59 |     def part(self):
60 |         return self._config.get("part", self._default_config["part"])
61 |
62 |     @part.setter
63 |     def part(self, value):
64 |         self._config["part"] = value
65 |         self.save_config()
66 |
67 |     @property
68 |     def mode(self):
69 |         return self._config.get("mode", self._default_config["mode"])
70 |
71 |     @mode.setter
72 |     def mode(self, value):
73 |         self._config["mode"] = value
74 |         self.save_config()
75 |
76 | if __name__ == "__main__":
77 |     parse = argparse.ArgumentParser()
78 |
79 |     parse.add_argument('--mode', choices=['Normal', 'Full', 'Specific'])
80 |     parse.add_argument('--crawl_size', type=int, default=1000)
81 |     parse.add_argument('--part', type=int, default=200)
82 |     parse.add_argument('--browser', choices=['Firefox', 'Edge'])
83 |     parse.add_argument('--profiles_path')
84 |
85 |     args = vars(parse.parse_args())
86 |
87 |     webconfig = WebConfig()
88 |     # Only persist options that were actually passed, so that setting one
89 |     # flag does not overwrite the other stored values with None.
90 |     if args['mode'] is not None:
91 |         webconfig.mode = args['mode']
92 |     if args['browser'] is not None:
93 |         webconfig.browser = args['browser']
94 |     if args['profiles_path'] is not None:
95 |         webconfig.profiles_path = args['profiles_path']
96 |     webconfig.crawl_size = args['crawl_size']
97 |     webconfig.part = args['part']
--------------------------------------------------------------------------------
/v1/app.py:
--------------------------------------------------------------------------------
1 | from client import Client
2 | import getpass
3 | from concurrent.futures import ThreadPoolExecutor
4 | import os
5 | import datetime
6 | import json
7 | import pyotp
8 |
9 | class App:
10 |     def __init__(self):
11 |         self.client = Client()
12 |         self.executor = ThreadPoolExecutor(max_workers=20)
13 |         self.current_dir = os.path.dirname(os.path.abspath(__file__))
14 |         if not os.path.exists(os.path.join(self.current_dir, 'data', 'download')):
15 |             os.makedirs(os.path.join(self.current_dir, 'data', 'download'))
16 |
17 |         with 
open("config.json", encoding="utf-8") as file: 18 | data = json.load(file) 19 | 20 | self.username = data["username"] if "username" in data else None 21 | self.password = data["password"] if "password" in data else None 22 | self.secret_key = data["secret_key"] if "secret_key" in data else None 23 | 24 | response = self.client.un_read() 25 | while response.status_code != 200: 26 | print(f"{response.status_code}: 需要登录") 27 | if self.username and self.password: 28 | username = self.username 29 | password = self.password 30 | else: 31 | username = input('username: ') 32 | password = getpass.getpass('password: ') 33 | token = self.client.oauth_login(username, password)["token"] 34 | self.client.sso_login(token) 35 | response = self.client.un_read() 36 | 37 | while not response.json()["success"]: 38 | if response.json()["message"] == "请手机短信验证": 39 | tmp = input("发送验证码(Y/n):") 40 | if tmp == 'Y': 41 | self.client.send_message() 42 | code = input("短信验证码:") 43 | self.client.login_by_message(code) 44 | elif response.json()["message"] == "请进行令牌验证": 45 | if self.secret_key: 46 | totp = pyotp.TOTP(self.secret_key) 47 | token = totp.now() 48 | print(f"自动生成OTP令牌:{token}") 49 | else: 50 | token = input("手机令牌:") 51 | self.client.login_by_token(token) 52 | response = self.client.un_read() 53 | self.client.save_cookies() 54 | 55 | def browse(self, page=1, limit=25): 56 | response = self.client.search(page=page, limit=limit) 57 | posts = response.json()["data"]["data"] 58 | return posts 59 | 60 | def read(self, post_id): 61 | post = self.client.get_post(post_id) 62 | if post["success"]: 63 | post = post["data"] 64 | 65 | reply = post["reply"] 66 | likenum = post["likenum"] 67 | text = post["text"] 68 | print(f"{post_id} reply:{reply} likenum:{likenum}") 69 | print(text) 70 | else: 71 | print(f"{post_id}: {post["message"]}") 72 | 73 | def get_post(self, post_id): 74 | post = self.client.get_post(post_id) 75 | if post["success"]: 76 | post = post["data"] 77 | if post["type"] == "image": 78 | image_type = post["url"].split(".")[-1] 79 | self.client.get_image(post_id, os.path.join(self.current_dir, 'data', 'download', post_id) + "." 
+ image_type) 80 | comments = self.client.get_comment(post_id)["data"] 81 | 82 | if comments: 83 | last_page = comments["last_page"] 84 | for page in range(2, last_page + 1): 85 | part_comments = self.client.get_comment(post_id, page)["data"] 86 | comments["data"] += part_comments["data"] 87 | comments = comments["data"] 88 | else: 89 | comments = [] 90 | return post, comments 91 | else: 92 | return {'pid': int(post_id), 'text': '您查看的树洞不存在', 'type': 'text'}, [] 93 | 94 | def get_posts(self, posts): 95 | posts_data = [] 96 | post_ids = [] 97 | futures = [self.executor.submit(lambda post_id=post_id: self.get_post(post_id)) for post_id in posts] 98 | for future in futures: 99 | post, comments = future.result() 100 | posts_data.append({"post": post, "comments": comments}) 101 | post_ids.append(post["pid"]) 102 | current_dir = os.path.dirname(os.path.abspath(__file__)) 103 | data_name = f'{max(post_ids)}-{min(post_ids)}.json' 104 | with open(os.path.join(current_dir, 'data', data_name), 'w', encoding='utf-8') as file: 105 | json.dump(posts_data, file, indent=4, ensure_ascii=False) 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | app = App() 111 | while True: 112 | post_id = input("post id: ") 113 | app.read(post_id) 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /v1/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import enum 3 | import random 4 | import re 5 | import os 6 | import json 7 | import uuid 8 | from http.cookiejar import Cookie 9 | 10 | class TreeHoleWeb(enum.Enum): 11 | OAUTH_LOGIN = "https://iaaa.pku.edu.cn/iaaa/oauthlogin.do" 12 | REDIR_URL = "https://treehole.pku.edu.cn/cas_iaaa_login?uuid=fc71db5799cf&plat=web" 13 | SSO_LOGIN = "http://treehole.pku.edu.cn/cas_iaaa_login" 14 | UN_READ = "https://treehole.pku.edu.cn/api/mail/un_read" 15 | SEARCH = "https://treehole.pku.edu.cn/api/pku_hole" 16 | COMMENT = "https://treehole.pku.edu.cn/api/pku_comment_v3" 17 | FOLLOW = "https://treehole.pku.edu.cn/api/pku_attention" 18 | GET_FOLLOW = "https://treehole.pku.edu.cn/api/follow_v2" 19 | REPORT = "https://treehole.pku.edu.cn/api/pku_comment/report" 20 | LOGIN_BY_TOKEN = "https://treehole.pku.edu.cn/api/login_iaaa_check_token" 21 | LOGIN_BY_MESSAGE = "https://treehole.pku.edu.cn/api/jwt_msg_verify" 22 | SEND_MESSAGE = "https://treehole.pku.edu.cn/api/jwt_send_msg" 23 | COURSE_TABLE = "https://treehole.pku.edu.cn/api/getCoursetable_v2" 24 | GRADE = "https://treehole.pku.edu.cn/api/course/score_v2" 25 | 26 | 27 | class Client: 28 | def __init__(self): 29 | self.session = requests.Session() 30 | self.session.headers.update({ 31 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0" 32 | }) 33 | self.load_cookies() 34 | if "pku_token" in self.session.cookies.keys(): 35 | self.authorization = self.session.cookies.values()[self.session.cookies.keys().index("pku_token")] 36 | self.session.headers.update({"authorization": f"Bearer {self.authorization}"}) 37 | 38 | def oauth_login(self, username, password): 39 | response = self.session.post(TreeHoleWeb.OAUTH_LOGIN.value, data={ 40 | 'appid': "PKU Helper", 41 | 'userName': username, 42 | 'password': password, 43 | 'randCode': '', 44 | 'smsCode': '', 45 | 'otpCode': '', 46 | 'redirUrl': TreeHoleWeb.REDIR_URL.value 47 | }) 48 | response.raise_for_status() 49 | return response.json() 50 | 51 | def sso_login(self, token): 
52 | rand = str(random.random()) 53 | response = self.session.get(TreeHoleWeb.SSO_LOGIN.value, params={ 54 | 'uuid': str(uuid.uuid4()).split("-")[-1], 55 | 'plat': "web", 56 | '_rand': rand, 57 | 'token': token 58 | }) 59 | response.raise_for_status() 60 | print(response.status_code, response.headers) 61 | 62 | self.authorization = re.search(r'token=(.*)', response.url).group(1) 63 | self.session.cookies.update({"pku_token": self.authorization}) 64 | self.session.headers.update({"authorization": f"Bearer {self.authorization}"}) 65 | return response 66 | 67 | def un_read(self): 68 | response = self.session.get(TreeHoleWeb.UN_READ.value) 69 | 70 | return response 71 | 72 | def login_by_token(self, token): 73 | response = self.session.post(TreeHoleWeb.LOGIN_BY_TOKEN.value, data={'code': token}) 74 | response.raise_for_status() 75 | print(response.status_code, response.json()) 76 | return response 77 | 78 | def login_by_message(self, code): 79 | response = self.session.post(TreeHoleWeb.LOGIN_BY_MESSAGE.value, data={'valid_code': code}) 80 | response.raise_for_status() 81 | print(response.status_code, response.json()) 82 | return response 83 | 84 | def send_message(self): 85 | response = self.session.post(TreeHoleWeb.SEND_MESSAGE.value) 86 | response.raise_for_status() 87 | return response 88 | 89 | def get_post(self, post_id): 90 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku/{post_id}") 91 | response.raise_for_status() 92 | return response.json() 93 | 94 | def get_comment(self, post_id, page=1, limit=15, sort="asc"): 95 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku_comment_v3/{post_id}", params={ 96 | "page": page, 97 | "limit": limit, 98 | "sort": sort 99 | }) 100 | response.raise_for_status() 101 | return response.json() 102 | 103 | def get_image(self, post_id, file_name): 104 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku_image/{post_id}", stream=True) 105 | if response.status_code == 200: 106 | with open(f"{file_name}", "wb") as file: 107 | for chunk in response.iter_content(1024): 108 | file.write(chunk) 109 | 110 | def search(self, keyword=None, page=1, limit=25, label=None): 111 | response = self.session.get(TreeHoleWeb.SEARCH.value, params={ 112 | "page": page, 113 | "limit": limit, 114 | "keyword": keyword, 115 | "label": label 116 | }) 117 | return response 118 | 119 | def follow(self, post_id): 120 | response = self.session.post(TreeHoleWeb.FOLLOW.value + f"/{post_id}") 121 | return response 122 | 123 | def get_follow(self, page=1, limit=25): 124 | response = self.session.get(TreeHoleWeb.GET_FOLLOW.value, params={ 125 | "page": page, 126 | "limit": limit 127 | }) 128 | return response 129 | 130 | def comment(self, post_id, text, comment_id=None): 131 | response = self.session.post(TreeHoleWeb.COMMENT.value, data={ 132 | "comment_id": comment_id, 133 | "pid": post_id, 134 | "text": text 135 | } if comment_id else { 136 | "pid": post_id, 137 | "text": text 138 | }) 139 | return response 140 | 141 | def report(self, tp, xid, other, reason): 142 | if tp == 'post': 143 | post_id = xid 144 | response = self.session.post(TreeHoleWeb.REPORT.value + f"/{post_id}", data={ 145 | "other": other, 146 | "reason": reason 147 | }) 148 | elif tp == 'comment': 149 | comment_id = xid 150 | response = self.session.post(TreeHoleWeb.REPORT.value, data={ 151 | "cid": comment_id, 152 | "other": other, 153 | "reason": reason 154 | }) 155 | return response 156 | 157 | def get_course_table(self): 158 | response = 
self.session.get(TreeHoleWeb.COURSE_TABLE.value) 159 | return response 160 | 161 | def get_grade(self): 162 | response = self.session.get(TreeHoleWeb.GRADE.value) 163 | return response 164 | 165 | def save_cookies(self): 166 | cookies_list = [] 167 | for cookie in self.session.cookies: 168 | cookie_dict = { 169 | 'name': cookie.name, 170 | 'value': cookie.value, 171 | 'domain': cookie.domain, 172 | 'path': cookie.path, 173 | 'expires': cookie.expires if cookie.expires else None, 174 | 'secure': cookie.secure, 175 | 'rest': {'HttpOnly': cookie.has_nonstandard_attr('HttpOnly')} 176 | } 177 | cookies_list.append(cookie_dict) 178 | 179 | current_path = os.path.abspath(__file__) 180 | cookie_path = os.path.join(os.path.dirname(current_path), "cookies.json") 181 | with open(cookie_path, 'w') as f: 182 | json.dump(cookies_list, f, indent=4) 183 | 184 | def load_cookies(self): 185 | current_path = os.path.abspath(__file__) 186 | cookie_path = os.path.join(os.path.dirname(current_path), "cookies.json") 187 | try: 188 | with open(cookie_path, 'r') as f: 189 | cookies_list = json.load(f) 190 | self.session.cookies.clear() 191 | for cookie_dict in cookies_list: 192 | cookie = Cookie( 193 | version=0, 194 | name=cookie_dict['name'], 195 | value=cookie_dict['value'], 196 | port=None, 197 | port_specified=False, 198 | domain=cookie_dict['domain'], 199 | domain_specified=bool(cookie_dict['domain']), 200 | domain_initial_dot=cookie_dict['domain'].startswith('.'), 201 | path=cookie_dict['path'], 202 | path_specified=bool(cookie_dict['path']), 203 | secure=cookie_dict['secure'], 204 | expires=cookie_dict['expires'], 205 | discard=False, 206 | comment=None, 207 | comment_url=None, 208 | rest=cookie_dict['rest'] 209 | ) 210 | self.session.cookies.set_cookie(cookie) 211 | 212 | except Exception as e: 213 | print(e) 214 | -------------------------------------------------------------------------------- /v0/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import datetime 5 | import json 6 | import re 7 | import base64 8 | import shutil 9 | import psutil 10 | import logging 11 | 12 | 13 | from selenium import webdriver 14 | from selenium.webdriver.chrome.options import Options 15 | from selenium.webdriver.firefox.options import Options as FirefoxOptions 16 | from selenium.webdriver.firefox.service import Service 17 | from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 18 | from selenium.webdriver.edge.options import Options as EdgeOptions 19 | from selenium.webdriver.edge.service import Service 20 | 21 | from selenium.webdriver.common.by import By 22 | from selenium.webdriver.common.action_chains import ActionChains 23 | from selenium.webdriver.common.keys import Keys 24 | from selenium.webdriver.support import expected_conditions as EC 25 | from selenium.webdriver.support.ui import WebDriverWait 26 | import time 27 | from post import Post, Reply 28 | from config import WebConfig 29 | 30 | 31 | def print_progress(iteration, total, prefix, suffix, decimals, length, fill='█'): 32 | terminal_size = shutil.get_terminal_size() 33 | max_length = terminal_size.columns 34 | out = '\r' 35 | p = True 36 | for i in range(len(iteration)): 37 | percent = ( 38 | "{0:." 
+ str(decimals[i]) + "f}").format(100 * (iteration[i] / float(total[i]))) 39 | filled_length = int(length[i] * iteration[i] // total[i]) 40 | bar = fill * filled_length + '-' * (length[i] - filled_length) 41 | out += f'{prefix[i]} |{bar}| {iteration[i]}/{total[i]} {percent}% {suffix[i]}' 42 | p = p and iteration[i] >= total[i] 43 | 44 | sys.stdout.write(out[:max_length-2]) 45 | sys.stdout.flush() 46 | if p: 47 | print() 48 | 49 | 50 | def scroll_element(driver, element): 51 | element_location = element.location_once_scrolled_into_view 52 | element_height = element.size['height'] 53 | scroll_y = element_location['y'] + element_height 54 | driver.execute_script(""" 55 | window.scrollTo({ 56 | top: arguments[0], 57 | behavior: 'smooth' 58 | }); 59 | """, scroll_y) 60 | 61 | 62 | def save_html(driver, file_name='index.html'): 63 | html = driver.page_source 64 | with open(file_name, 'w', encoding='utf-8') as file: 65 | file.write(html) 66 | 67 | 68 | def convert_posts_to_json(posts, file_name='output.json'): 69 | # print('Saving into json...') 70 | output = [] 71 | for post in posts: 72 | output.append({ 73 | 'id': post.id, 74 | 'likenum': post.likenum, 75 | 'badge': post.badge, 76 | 'content': post.content, 77 | 'time': str(post.time), 78 | 'quote': post.quote, 79 | 'replies': [ 80 | { 81 | 'id': reply.id, 82 | 'name': reply.name, 83 | 'content': reply.content, 84 | 'time': str(reply.time), 85 | 'quote': reply.quote 86 | } 87 | for reply in post.replies 88 | ], 89 | 'tip': post.tip 90 | }) 91 | current_dir = os.path.dirname(os.path.abspath(__file__)) 92 | if not os.path.exists(os.path.join(current_dir, 'data')): 93 | os.makedirs(os.path.join(current_dir, 'data')) 94 | file_path = os.path.join(current_dir, 'data', file_name) 95 | json.dump(output, open(file_path, 'w', encoding='utf-8'), 96 | ensure_ascii=False, indent=2) 97 | 98 | 99 | def get_image(box_content, image_name): 100 | img_element = box_content.find_element( 101 | By.XPATH, ".//p[@class='img']/a/img[starts-with(@src, 'blob:')]") 102 | result = driver.execute_async_script(""" 103 | var img = arguments[0]; 104 | var callback = arguments[1]; 105 | var xhr = new XMLHttpRequest(); 106 | xhr.open('GET', img.src, true); 107 | xhr.responseType = 'blob'; 108 | xhr.onload = function(e) { 109 | if (this.status == 200) { 110 | var reader = new FileReader(); 111 | reader.onloadend = function() { 112 | callback(reader.result); 113 | } 114 | reader.readAsDataURL(this.response); 115 | } else { 116 | callback(null); 117 | } 118 | }; 119 | xhr.send(); 120 | """, img_element) 121 | if result and 'data:image' in result: 122 | current_dir = os.path.dirname(os.path.abspath(__file__)) 123 | if not os.path.exists(os.path.join(current_dir, 'data', 'download')): 124 | os.makedirs(os.path.join(current_dir, 'data', 'download')) 125 | image_path = os.path.join(current_dir, 'data', 'download', image_name) 126 | image_data = result.split(',')[1] 127 | with open(image_path, 'wb') as f: 128 | f.write(base64.b64decode(image_data)) 129 | # print("download a image") 130 | else: 131 | pass 132 | # print("cannot download") 133 | 134 | 135 | def extract_post(post_tree, crawled_pids): 136 | try: 137 | pid = post_tree.find_element( 138 | By.XPATH, ".//div[@class='flow-item']//code[@class='box-id --box-id-copy-content']").get_attribute('textContent').strip() 139 | 140 | if pid in crawled_pids: 141 | return None 142 | else: 143 | crawled_pids.add(pid) 144 | try: 145 | pquote = post_tree.find_element( 146 | By.XPATH, ".//div[@class='flow-item 
flow-item-quote']/div[@class='box']/div[@class='box-header']//code[@class='box-id --box-id-copy-content']").get_attribute('textContent').strip() 147 | except: 148 | pquote = None 149 | try: 150 | plikenum = post_tree.find_element( 151 | By.XPATH, ".//div[@class='flow-item']//span[@class='box-header-badge likenum']").get_attribute('textContent').strip() 152 | except: 153 | plikenum = 0 154 | try: 155 | pbadge = post_tree.find_element( 156 | By.XPATH, ".//div[@class='flow-item']//span[@class='box-header-badge']").get_attribute('textContent').strip() 157 | except: 158 | pbadge = 0 159 | pcontent_fold_body = post_tree.find_element( 160 | By.XPATH, ".//div[@class='flow-item']//div[@class='box-content']//div[@class='content-fold-body']") 161 | pcontent = pcontent_fold_body.get_attribute('textContent') 162 | try: 163 | get_image(pcontent_fold_body, f'image_{pid}.png') 164 | pimage = True 165 | except: 166 | pimage = False 167 | ptime = post_tree.find_element( 168 | By.XPATH, ".//div[@class='flow-item']//div[@class='box-header']").get_attribute('textContent').strip() 169 | ptime = re.search(r'\d{2}-\d{2}\s\d{2}:\d{2}', ptime).group() 170 | 171 | new_post = Post(pid, plikenum, pbadge, pcontent, ptime, pquote, pimage) 172 | except Exception as e: 173 | html_code = driver.execute_script( 174 | "return arguments[0].outerHTML;", post_tree) 175 | logger.error(f'cannot extract post: {e} \n html code:\n{html_code}') 176 | # print(e) 177 | return None 178 | 179 | post = post_tree.find_element(By.XPATH, ".//div[@class='flow-item']") 180 | post = post.find_element(By.XPATH, "..") 181 | try: 182 | pbox_tip = post.find_element( 183 | By.XPATH, ".//div[@class=box box-tip]").text 184 | new_post.tip = pbox_tip 185 | except: 186 | new_post.tip = None 187 | for reply_tree in post.find_elements(By.XPATH, ".//div[@class='flow-reply box dialog-hole-reply']"): 188 | rid = reply_tree.find_element( 189 | By.XPATH, ".//code[@class='box-id']").get_attribute('textContent').strip() 190 | 191 | rtime_ = reply_tree.find_element( 192 | By.XPATH, "./div[@class='box-header']") 193 | rtime = rtime_.get_attribute('textContent').strip() 194 | if not rtime: 195 | html_code = driver.execute_script( 196 | "return arguments[0].outerHTML;", rtime_) 197 | print(html_code) 198 | rtime = re.search(r'\d{2}-\d{2}\s\d{2}:\d{2}', rtime).group() 199 | # rtime = datetime.datetime.strptime(rtime,'%Y-%m-%dT%H:%M:%S') 200 | rbox = reply_tree.find_element(By.XPATH, "./div[@class='box-content']") 201 | try: 202 | rquote = rbox.find_element( 203 | By.XPATH, "./div[contains(@class, 'quote')]").get_attribute('textContent').strip() 204 | except: 205 | rquote = None 206 | rcontents = rbox.find_elements(By.XPATH, "./span") 207 | name = rcontents[1].get_attribute('textContent') 208 | if rquote: 209 | quote_name = rcontents[-2].get_attribute('textContent') 210 | else: 211 | quote_name = None 212 | rcontent = rcontents[-1].get_attribute('textContent')[2:] 213 | if rquote: 214 | new_post.add_reply(rid, name, rcontent, rtime, 215 | (quote_name, rquote)) 216 | else: 217 | new_post.add_reply(rid, name, rcontent, rtime, None) 218 | 219 | return new_post 220 | 221 | 222 | def get_posts(driver, crawled_pids): 223 | posts = [] 224 | post_trees = driver.find_elements( 225 | By.XPATH, "//div[@class='flow-chunk']/div") 226 | ''' 227 | driver.execute_script(""" 228 | arguments[0].scrollIntoView({ 229 | behavior: 'smooth', 230 | block: 'start' 231 | }); 232 | """, post_trees[-1]) 233 | ''' 234 | for post_tree in post_trees: 235 | # print('a new post') 236 | new_post = 
extract_post(post_tree, crawled_pids) 237 | if new_post != None: 238 | posts.append(new_post) 239 | scroll_element(driver, post_trees[-1]) 240 | for i in range(len(post_trees) - 3): 241 | post_tree = post_trees[i] 242 | try: 243 | driver.execute_script(""" 244 | arguments[0].parentNode.removeChild(arguments[0]); 245 | """, post_tree) 246 | except: 247 | logger.warning('cannot delete node') 248 | 249 | return posts 250 | 251 | 252 | if __name__ == '__main__': 253 | webconfig = WebConfig() 254 | logger = logging.getLogger() 255 | logger.setLevel(logging.INFO) 256 | log_name = f'{datetime.datetime.now(datetime.UTC).strftime("UTC%Y-%m-%d %H%M%S")}.log' 257 | file_handler = logging.FileHandler(log_name, encoding='utf-8') 258 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 259 | file_handler.setFormatter(formatter) 260 | logger.addHandler(file_handler) 261 | mode = webconfig.mode 262 | browser = webconfig.browser 263 | profiles_path = webconfig.profiles_path 264 | crawl_size = webconfig.crawl_size 265 | part = webconfig.part 266 | 267 | logger.info( 268 | f'browser={browser}, profiles_path={profiles_path}, crawl_size={crawl_size}, part={part}') 269 | 270 | if browser == 'Firefox': 271 | options = FirefoxOptions() 272 | firefox_profile = FirefoxProfile(profiles_path) 273 | options.profile = firefox_profile 274 | # options.add_argument("--headless") 275 | driver = webdriver.Firefox(options=options) 276 | elif browser == 'Edge': 277 | options = EdgeOptions() 278 | options.add_argument(rf'user-data-dir={profiles_path}') 279 | # options.add_argument("--headless=new") 280 | driver = webdriver.Edge(options=options) 281 | driver.execute_script(""" 282 | Object.defineProperty(Navigator, 'webdriver', {get: () => undefined}); 283 | Object.defineProperty(navigator, 'webdriver', {get: () => false}); 284 | """) 285 | driver.get('https://treehole.pku.edu.cn') 286 | time.sleep(1) 287 | current_url = driver.current_url 288 | if current_url == 'https://treehole.pku.edu.cn/web/verification': 289 | content = driver.find_element(By.TAG_NAME, 'body').text 290 | if '短信' in content: 291 | logger.error('need message verification') 292 | elif '手机令牌' in content: 293 | logger.error('need mobile token') 294 | else: 295 | logger.error('unknown error') 296 | print('Fail to log') 297 | driver.close() 298 | elif current_url.startswith('https://treehole.pku.edu.cn'): 299 | logger.info('Log in successfully') 300 | print('Log in successfully') 301 | time.sleep(5) 302 | # save_html(driver) 303 | 304 | posts = [] 305 | crawled_pids = set([]) 306 | i = 1 307 | total_length = 0 308 | memory_warn = False 309 | timeout_warn = False 310 | while (total_length < crawl_size): 311 | memory = psutil.virtual_memory() 312 | if not memory_warn and memory.available <= 524288000: 313 | memory_warn = True 314 | logger.warning( 315 | f'lack of available memory: {memory.available / (1024**2):.2f} MB') 316 | elif memory.available <= 262144000: 317 | logger.error( 318 | f'serious lack of available memory: {memory.available / (1024**2):.2f} MB') 319 | break 320 | new_posts = get_posts(driver, crawled_pids) 321 | if not timeout_warn: 322 | if len(new_posts) == 0: 323 | t_start = time.time() 324 | timeout_warn = True 325 | else: 326 | if len(new_posts) == 0: 327 | t_end = time.time() 328 | if t_end - t_start >= 20: 329 | logger.error(f'time out: {(t_end - t_start):.2f}s') 330 | break 331 | else: 332 | timeout_warn = False 333 | 334 | posts += new_posts 335 | if len(posts) >= part: 336 | start = posts[0].id 337 | end = 
posts[part - 1].id 338 | now = datetime.datetime.now(datetime.UTC) 339 | now = now.strftime("UTC%Y-%m-%d %H%M%S") 340 | json_name = f'tree_hole_{part}_{start}-{end}({now}).json' 341 | convert_posts_to_json( 342 | posts[:part], file_name=json_name) 343 | total_length += part 344 | logger.info(f'part {i} done: {total_length}/{crawl_size}') 345 | i += 1 346 | posts = posts[part:] 347 | print_progress((len(posts), total_length), (part, crawl_size), 348 | (f'第{i}部分:', '总进度:'), (' ', ''), (1, 1), (10, 10)) 349 | time.sleep(0.1) 350 | logger.info('crawling done') 351 | print('Crawling done') 352 | input("Press any key to finish...") 353 | driver.close() 354 | 355 | else: 356 | logger.warning('fail to log, try again') 357 | print('Fail to log') 358 | # input('Press any key to quit...') 359 | driver.close() --------------------------------------------------------------------------------
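For reference, here is a minimal sketch of loading one of the batch files that `convert_posts_to_json()` in run.py writes to `v0/data/`; the file name below is a placeholder for whatever batch `run.py` actually produced.

```python
import json

# Placeholder name -- substitute an actual tree_hole_*.json file from v0/data/
with open("data/tree_hole_200_4000000-4000199(UTC2024-01-01 000000).json", encoding="utf-8") as f:
    posts = json.load(f)

# Each entry mirrors convert_posts_to_json(): id, likenum, badge, content, time,
# quote, tip, and replies (a list of {id, name, content, time, quote} dicts).
for post in posts:
    print(post["id"], post["likenum"], len(post["replies"]))
```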