├── .gitignore
├── v0
│   ├── post.py
│   ├── config.py
│   └── run.py
├── README.md
└── v1
    ├── app.py
    └── client.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | *.html
3 | *.json
4 | *.log
5 | **/Profiles
6 | **/temp
7 | **/data
8 | **/download
--------------------------------------------------------------------------------
/v0/post.py:
--------------------------------------------------------------------------------
1 | class Post:
2 |     def __init__(self, pid, plikenum, pbadge, pcontent, ptime, pquote=None, pimage=False):
3 |         self.id = pid
4 |         self.likenum = plikenum
5 |         self.badge = pbadge
6 |         self.content = pcontent
7 |         self.time = ptime
8 |         self.quote = pquote
9 |         self.image = pimage
10 |         self.replies = []
11 |
12 |     def add_reply(self, rid, name, rcontent, rtime, rquote):
13 |         self.replies.append(Reply(rid, name, rcontent, rtime, rquote))
14 |
15 |
16 | class Reply:
17 |     def __init__(self, rid, name, rcontent, rtime, rquote):
18 |         self.id = rid
19 |         self.name = name
20 |         self.content = rcontent
21 |         self.time = rtime
22 |         self.quote = rquote
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PKUHoleCrawler
2 | # v1
3 | With a better understanding of the Tree Hole API, the earlier Selenium-based approach has been dropped in favor of calling the API directly with requests. This brings fewer dependencies, more stable runs, higher efficiency, and support for fetching posts by specified IDs.
4 |
5 | ## Usage
6 | `client.py` provides low-level methods for talking to the Tree Hole API, while `app.py` provides integrated methods for fetching data in bulk.
7 |
8 | The main functionality lives in `App.get_posts()`, which accepts a list of post IDs.
9 |
10 | Logging in may require ~~a mobile OTP token or~~ SMS verification; just follow the prompts.
11 |
12 | `config.json`:
13 | ```json
14 | {
15 |     "username": "your_username",
16 |     "password": "your_password",
17 |     "secret_key": "your_secret_key"
18 | }
19 | ```
20 | `secret_key` is used to generate the mobile OTP token automatically. It can be obtained by capturing the network traffic while binding the mobile token.
21 |
22 | ## Notes
23 | Since the API is now called directly from multiple threads, keep the number of posts per crawl small, otherwise the account risks being banned.
24 | # v0
25 | A (simple) PKU Tree Hole crawler that scrapes the dynamically rendered pages with Selenium.
26 |
27 | An improvement on [luciusssss/PKUHoleCrawler: 北大树洞爬虫 (github.com)](https://github.com/luciusssss/PKUHoleCrawler), adapted to the new version of the Tree Hole at [北大树洞 (pku.edu.cn)](https://treehole.pku.edu.cn). Edge and Firefox are currently supported.
28 |
29 | ## Setup
30 |
31 | Install Selenium:
32 |
33 | ```
34 | pip3 install selenium
35 | ```
36 |
37 | For automatic login, the crawler needs a copy of the browser's user data. Edge's user data lives by default at `C:\Users\YourName\AppData\Local\Microsoft\Edge\User Data` (Windows) or `/home/YourName/.config/microsoft-edge/User Data` (Linux); Firefox's lives in a randomly named folder (e.g. `32fy5laa.default-release`) under `C:\Users\YourName\AppData\Roaming\Mozilla\Firefox\Profiles\` (Windows) or `/home/YourName/.mozilla/firefox/` (Linux). The original browser must stay logged in (i.e. visiting https://treehole.pku.edu.cn does not redirect to the login page).
38 |
39 | Install the webdriver matching your browser as needed.
40 |
41 | ## Usage
42 |
43 | Set the run parameters with `config.py`:
44 |
45 | ```
46 | config.py [-h] [--crawl_size CRAWL_SIZE] [--part PART] [--browser BROWSER] [--profiles_path PROFILES_PATH]
47 | ```
48 |
49 | Then run `run.py`:
50 |
51 | ```
52 | python3 run.py
53 | ```
54 |
55 | The crawled text is saved in batches of `part` posts to `tree_hole_{part}_{start}-{end}({utc-time}).json`;
56 |
57 | images are saved to the `download` folder as `image_{pid}.png`.
58 |
59 | Command-line interface:
60 |
61 | ```
62 | Log in successfully
63 | 第1部分: |██████████| 500/500 100.0% 总进度: |██████████| 2000/2000 100.0%
64 | Crawling done
65 | Press any key to finish...
66 | ```
67 |
68 |
69 |
70 | ## Notes
71 | Because the original architecture relies on Selenium and a webdriver, it is less stable and less efficient, so the crawler has been rewritten (see v1). The v0 code is no longer maintained.
72 |
73 | If you want visual control over the browsing process, you can still build on it.
74 |
75 |
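As a concrete illustration of the v1 workflow described in the README's Usage section, here is a minimal sketch of driving `App` from a script in the `v1` directory. It assumes a filled-in `config.json` next to `app.py`; the post IDs are placeholders.

```python
# Minimal sketch of the v1 flow; the post IDs below are placeholders.
from app import App

app = App()  # reads config.json and logs in, prompting for SMS/OTP verification if needed

# Print a single post to the console
app.read("4000000")

# Fetch several posts (text, images and comments) concurrently and write them
# to v1/data/{max_id}-{min_id}.json
app.get_posts(["4000000", "4000001", "4000002"])
```

`App.get_posts()` fans the requests out over a 20-worker thread pool, which is why the README warns against crawling too many posts in one go.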
--------------------------------------------------------------------------------
/v0/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import argparse
4 |
5 | class WebConfig:
6 |     _default_config = {
7 |         "browser": "chrome",
8 |         "profiles_path": "/path/to/profiles",
9 |         "crawl_size": 1000,
10 |         "part": 200,
11 |         "mode": "Normal"
12 |     }
13 |
14 |     _config_file = "config.json"
15 |
16 |     def __init__(self):
17 |         self.load_config()
18 |
19 |     def load_config(self):
20 |         if os.path.exists(self._config_file):
21 |             with open(self._config_file, 'r') as f:
22 |                 self._config = json.load(f)
23 |         else:
24 |             self._config = self._default_config
25 |             self.save_config()
26 |
27 |     def save_config(self):
28 |         with open(self._config_file, 'w') as f:
29 |             json.dump(self._config, f, indent=4)
30 |
31 |     @property
32 |     def browser(self):
33 |         return self._config.get("browser", self._default_config["browser"])
34 |
35 |     @browser.setter
36 |     def browser(self, value):
37 |         self._config["browser"] = value
38 |         self.save_config()
39 |
40 |     @property
41 |     def profiles_path(self):
42 |         return self._config.get("profiles_path", self._default_config["profiles_path"])
43 |
44 |     @profiles_path.setter
45 |     def profiles_path(self, value):
46 |         self._config["profiles_path"] = value
47 |         self.save_config()
48 |
49 |     @property
50 |     def crawl_size(self):
51 |         return self._config.get("crawl_size", self._default_config["crawl_size"])
52 |
53 |     @crawl_size.setter
54 |     def crawl_size(self, value):
55 |         self._config["crawl_size"] = value
56 |         self.save_config()
57 |
58 |     @property
59 |     def part(self):
60 |         return self._config.get("part", self._default_config["part"])
61 |
62 |     @part.setter
63 |     def part(self, value):
64 |         self._config["part"] = value
65 |         self.save_config()
66 |
67 |     @property
68 |     def mode(self):
69 |         return self._config.get("mode", self._default_config["mode"])
70 |
71 |     @mode.setter
72 |     def mode(self, value):
73 |         self._config["mode"] = value
74 |         self.save_config()
75 |
76 | if __name__ == "__main__":
77 |     parse = argparse.ArgumentParser()
78 |
79 |     parse.add_argument('--mode', choices=['Normal', 'Full', 'Specific'])
80 |     parse.add_argument('--crawl_size', type=int, default=1000)
81 |     parse.add_argument('--part', type=int, default=200)
82 |     parse.add_argument('--browser', choices=['Firefox', 'Edge'])
83 |     parse.add_argument('--profiles_path')
84 |
85 |     args = vars(parse.parse_args())
86 |
87 |     webconfig = WebConfig()
88 |     # Only persist options that were actually passed, so that setting one
89 |     # flag does not overwrite the other stored values with None.
90 |     if args['mode'] is not None:
91 |         webconfig.mode = args['mode']
92 |     if args['browser'] is not None:
93 |         webconfig.browser = args['browser']
94 |     if args['profiles_path'] is not None:
95 |         webconfig.profiles_path = args['profiles_path']
96 |     webconfig.crawl_size = args['crawl_size']
97 |     webconfig.part = args['part']
--------------------------------------------------------------------------------
/v1/app.py:
--------------------------------------------------------------------------------
1 | from client import Client
2 | import getpass
3 | from concurrent.futures import ThreadPoolExecutor
4 | import os
5 | import datetime
6 | import json
7 | import pyotp
8 |
9 | class App:
10 |     def __init__(self):
11 |         self.client = Client()
12 |         self.executor = ThreadPoolExecutor(max_workers=20)
13 |         self.current_dir = os.path.dirname(os.path.abspath(__file__))
14 |         if not os.path.exists(os.path.join(self.current_dir, 'data', 'download')):
15 |             os.makedirs(os.path.join(self.current_dir, 'data', 'download'))
16 |
17 |         with 
open("config.json", encoding="utf-8") as file: 18 | data = json.load(file) 19 | 20 | self.username = data["username"] if "username" in data else None 21 | self.password = data["password"] if "password" in data else None 22 | self.secret_key = data["secret_key"] if "secret_key" in data else None 23 | 24 | response = self.client.un_read() 25 | while response.status_code != 200: 26 | print(f"{response.status_code}: 需要登录") 27 | if self.username and self.password: 28 | username = self.username 29 | password = self.password 30 | else: 31 | username = input('username: ') 32 | password = getpass.getpass('password: ') 33 | token = self.client.oauth_login(username, password)["token"] 34 | self.client.sso_login(token) 35 | response = self.client.un_read() 36 | 37 | while not response.json()["success"]: 38 | if response.json()["message"] == "请手机短信验证": 39 | tmp = input("发送验证码(Y/n):") 40 | if tmp == 'Y': 41 | self.client.send_message() 42 | code = input("短信验证码:") 43 | self.client.login_by_message(code) 44 | elif response.json()["message"] == "请进行令牌验证": 45 | if self.secret_key: 46 | totp = pyotp.TOTP(self.secret_key) 47 | token = totp.now() 48 | print(f"自动生成OTP令牌:{token}") 49 | else: 50 | token = input("手机令牌:") 51 | self.client.login_by_token(token) 52 | response = self.client.un_read() 53 | self.client.save_cookies() 54 | 55 | def browse(self, page=1, limit=25): 56 | response = self.client.search(page=page, limit=limit) 57 | posts = response.json()["data"]["data"] 58 | return posts 59 | 60 | def read(self, post_id): 61 | post = self.client.get_post(post_id) 62 | if post["success"]: 63 | post = post["data"] 64 | 65 | reply = post["reply"] 66 | likenum = post["likenum"] 67 | text = post["text"] 68 | print(f"{post_id} reply:{reply} likenum:{likenum}") 69 | print(text) 70 | else: 71 | print(f"{post_id}: {post["message"]}") 72 | 73 | def get_post(self, post_id): 74 | post = self.client.get_post(post_id) 75 | if post["success"]: 76 | post = post["data"] 77 | if post["type"] == "image": 78 | image_type = post["url"].split(".")[-1] 79 | self.client.get_image(post_id, os.path.join(self.current_dir, 'data', 'download', post_id) + "." 
+ image_type) 80 | comments = self.client.get_comment(post_id)["data"] 81 | 82 | if comments: 83 | last_page = comments["last_page"] 84 | for page in range(2, last_page + 1): 85 | part_comments = self.client.get_comment(post_id, page)["data"] 86 | comments["data"] += part_comments["data"] 87 | comments = comments["data"] 88 | else: 89 | comments = [] 90 | return post, comments 91 | else: 92 | return {'pid': int(post_id), 'text': '您查看的树洞不存在', 'type': 'text'}, [] 93 | 94 | def get_posts(self, posts): 95 | posts_data = [] 96 | post_ids = [] 97 | futures = [self.executor.submit(lambda post_id=post_id: self.get_post(post_id)) for post_id in posts] 98 | for future in futures: 99 | post, comments = future.result() 100 | posts_data.append({"post": post, "comments": comments}) 101 | post_ids.append(post["pid"]) 102 | current_dir = os.path.dirname(os.path.abspath(__file__)) 103 | data_name = f'{max(post_ids)}-{min(post_ids)}.json' 104 | with open(os.path.join(current_dir, 'data', data_name), 'w', encoding='utf-8') as file: 105 | json.dump(posts_data, file, indent=4, ensure_ascii=False) 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | app = App() 111 | while True: 112 | post_id = input("post id: ") 113 | app.read(post_id) 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /v1/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import enum 3 | import random 4 | import re 5 | import os 6 | import json 7 | import uuid 8 | from http.cookiejar import Cookie 9 | 10 | class TreeHoleWeb(enum.Enum): 11 | OAUTH_LOGIN = "https://iaaa.pku.edu.cn/iaaa/oauthlogin.do" 12 | REDIR_URL = "https://treehole.pku.edu.cn/cas_iaaa_login?uuid=fc71db5799cf&plat=web" 13 | SSO_LOGIN = "http://treehole.pku.edu.cn/cas_iaaa_login" 14 | UN_READ = "https://treehole.pku.edu.cn/api/mail/un_read" 15 | SEARCH = "https://treehole.pku.edu.cn/api/pku_hole" 16 | COMMENT = "https://treehole.pku.edu.cn/api/pku_comment_v3" 17 | FOLLOW = "https://treehole.pku.edu.cn/api/pku_attention" 18 | GET_FOLLOW = "https://treehole.pku.edu.cn/api/follow_v2" 19 | REPORT = "https://treehole.pku.edu.cn/api/pku_comment/report" 20 | LOGIN_BY_TOKEN = "https://treehole.pku.edu.cn/api/login_iaaa_check_token" 21 | LOGIN_BY_MESSAGE = "https://treehole.pku.edu.cn/api/jwt_msg_verify" 22 | SEND_MESSAGE = "https://treehole.pku.edu.cn/api/jwt_send_msg" 23 | COURSE_TABLE = "https://treehole.pku.edu.cn/api/getCoursetable_v2" 24 | GRADE = "https://treehole.pku.edu.cn/api/course/score_v2" 25 | 26 | 27 | class Client: 28 | def __init__(self): 29 | self.session = requests.Session() 30 | self.session.headers.update({ 31 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0" 32 | }) 33 | self.load_cookies() 34 | if "pku_token" in self.session.cookies.keys(): 35 | self.authorization = self.session.cookies.values()[self.session.cookies.keys().index("pku_token")] 36 | self.session.headers.update({"authorization": f"Bearer {self.authorization}"}) 37 | 38 | def oauth_login(self, username, password): 39 | response = self.session.post(TreeHoleWeb.OAUTH_LOGIN.value, data={ 40 | 'appid': "PKU Helper", 41 | 'userName': username, 42 | 'password': password, 43 | 'randCode': '', 44 | 'smsCode': '', 45 | 'otpCode': '', 46 | 'redirUrl': TreeHoleWeb.REDIR_URL.value 47 | }) 48 | response.raise_for_status() 49 | return response.json() 50 | 51 | def sso_login(self, token): 
52 | rand = str(random.random()) 53 | response = self.session.get(TreeHoleWeb.SSO_LOGIN.value, params={ 54 | 'uuid': str(uuid.uuid4()).split("-")[-1], 55 | 'plat': "web", 56 | '_rand': rand, 57 | 'token': token 58 | }) 59 | response.raise_for_status() 60 | print(response.status_code, response.headers) 61 | 62 | self.authorization = re.search(r'token=(.*)', response.url).group(1) 63 | self.session.cookies.update({"pku_token": self.authorization}) 64 | self.session.headers.update({"authorization": f"Bearer {self.authorization}"}) 65 | return response 66 | 67 | def un_read(self): 68 | response = self.session.get(TreeHoleWeb.UN_READ.value) 69 | 70 | return response 71 | 72 | def login_by_token(self, token): 73 | response = self.session.post(TreeHoleWeb.LOGIN_BY_TOKEN.value, data={'code': token}) 74 | response.raise_for_status() 75 | print(response.status_code, response.json()) 76 | return response 77 | 78 | def login_by_message(self, code): 79 | response = self.session.post(TreeHoleWeb.LOGIN_BY_MESSAGE.value, data={'valid_code': code}) 80 | response.raise_for_status() 81 | print(response.status_code, response.json()) 82 | return response 83 | 84 | def send_message(self): 85 | response = self.session.post(TreeHoleWeb.SEND_MESSAGE.value) 86 | response.raise_for_status() 87 | return response 88 | 89 | def get_post(self, post_id): 90 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku/{post_id}") 91 | response.raise_for_status() 92 | return response.json() 93 | 94 | def get_comment(self, post_id, page=1, limit=15, sort="asc"): 95 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku_comment_v3/{post_id}", params={ 96 | "page": page, 97 | "limit": limit, 98 | "sort": sort 99 | }) 100 | response.raise_for_status() 101 | return response.json() 102 | 103 | def get_image(self, post_id, file_name): 104 | response = self.session.get(f"https://treehole.pku.edu.cn/api/pku_image/{post_id}", stream=True) 105 | if response.status_code == 200: 106 | with open(f"{file_name}", "wb") as file: 107 | for chunk in response.iter_content(1024): 108 | file.write(chunk) 109 | 110 | def search(self, keyword=None, page=1, limit=25, label=None): 111 | response = self.session.get(TreeHoleWeb.SEARCH.value, params={ 112 | "page": page, 113 | "limit": limit, 114 | "keyword": keyword, 115 | "label": label 116 | }) 117 | return response 118 | 119 | def follow(self, post_id): 120 | response = self.session.post(TreeHoleWeb.FOLLOW.value + f"/{post_id}") 121 | return response 122 | 123 | def get_follow(self, page=1, limit=25): 124 | response = self.session.get(TreeHoleWeb.GET_FOLLOW.value, params={ 125 | "page": page, 126 | "limit": limit 127 | }) 128 | return response 129 | 130 | def comment(self, post_id, text, comment_id=None): 131 | response = self.session.post(TreeHoleWeb.COMMENT.value, data={ 132 | "comment_id": comment_id, 133 | "pid": post_id, 134 | "text": text 135 | } if comment_id else { 136 | "pid": post_id, 137 | "text": text 138 | }) 139 | return response 140 | 141 | def report(self, tp, xid, other, reason): 142 | if tp == 'post': 143 | post_id = xid 144 | response = self.session.post(TreeHoleWeb.REPORT.value + f"/{post_id}", data={ 145 | "other": other, 146 | "reason": reason 147 | }) 148 | elif tp == 'comment': 149 | comment_id = xid 150 | response = self.session.post(TreeHoleWeb.REPORT.value, data={ 151 | "cid": comment_id, 152 | "other": other, 153 | "reason": reason 154 | }) 155 | return response 156 | 157 | def get_course_table(self): 158 | response = 
self.session.get(TreeHoleWeb.COURSE_TABLE.value) 159 | return response 160 | 161 | def get_grade(self): 162 | response = self.session.get(TreeHoleWeb.GRADE.value) 163 | return response 164 | 165 | def save_cookies(self): 166 | cookies_list = [] 167 | for cookie in self.session.cookies: 168 | cookie_dict = { 169 | 'name': cookie.name, 170 | 'value': cookie.value, 171 | 'domain': cookie.domain, 172 | 'path': cookie.path, 173 | 'expires': cookie.expires if cookie.expires else None, 174 | 'secure': cookie.secure, 175 | 'rest': {'HttpOnly': cookie.has_nonstandard_attr('HttpOnly')} 176 | } 177 | cookies_list.append(cookie_dict) 178 | 179 | current_path = os.path.abspath(__file__) 180 | cookie_path = os.path.join(os.path.dirname(current_path), "cookies.json") 181 | with open(cookie_path, 'w') as f: 182 | json.dump(cookies_list, f, indent=4) 183 | 184 | def load_cookies(self): 185 | current_path = os.path.abspath(__file__) 186 | cookie_path = os.path.join(os.path.dirname(current_path), "cookies.json") 187 | try: 188 | with open(cookie_path, 'r') as f: 189 | cookies_list = json.load(f) 190 | self.session.cookies.clear() 191 | for cookie_dict in cookies_list: 192 | cookie = Cookie( 193 | version=0, 194 | name=cookie_dict['name'], 195 | value=cookie_dict['value'], 196 | port=None, 197 | port_specified=False, 198 | domain=cookie_dict['domain'], 199 | domain_specified=bool(cookie_dict['domain']), 200 | domain_initial_dot=cookie_dict['domain'].startswith('.'), 201 | path=cookie_dict['path'], 202 | path_specified=bool(cookie_dict['path']), 203 | secure=cookie_dict['secure'], 204 | expires=cookie_dict['expires'], 205 | discard=False, 206 | comment=None, 207 | comment_url=None, 208 | rest=cookie_dict['rest'] 209 | ) 210 | self.session.cookies.set_cookie(cookie) 211 | 212 | except Exception as e: 213 | print(e) 214 | -------------------------------------------------------------------------------- /v0/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import datetime 5 | import json 6 | import re 7 | import base64 8 | import shutil 9 | import psutil 10 | import logging 11 | 12 | 13 | from selenium import webdriver 14 | from selenium.webdriver.chrome.options import Options 15 | from selenium.webdriver.firefox.options import Options as FirefoxOptions 16 | from selenium.webdriver.firefox.service import Service 17 | from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 18 | from selenium.webdriver.edge.options import Options as EdgeOptions 19 | from selenium.webdriver.edge.service import Service 20 | 21 | from selenium.webdriver.common.by import By 22 | from selenium.webdriver.common.action_chains import ActionChains 23 | from selenium.webdriver.common.keys import Keys 24 | from selenium.webdriver.support import expected_conditions as EC 25 | from selenium.webdriver.support.ui import WebDriverWait 26 | import time 27 | from post import Post, Reply 28 | from config import WebConfig 29 | 30 | 31 | def print_progress(iteration, total, prefix, suffix, decimals, length, fill='█'): 32 | terminal_size = shutil.get_terminal_size() 33 | max_length = terminal_size.columns 34 | out = '\r' 35 | p = True 36 | for i in range(len(iteration)): 37 | percent = ( 38 | "{0:." 
+ str(decimals[i]) + "f}").format(100 * (iteration[i] / float(total[i]))) 39 | filled_length = int(length[i] * iteration[i] // total[i]) 40 | bar = fill * filled_length + '-' * (length[i] - filled_length) 41 | out += f'{prefix[i]} |{bar}| {iteration[i]}/{total[i]} {percent}% {suffix[i]}' 42 | p = p and iteration[i] >= total[i] 43 | 44 | sys.stdout.write(out[:max_length-2]) 45 | sys.stdout.flush() 46 | if p: 47 | print() 48 | 49 | 50 | def scroll_element(driver, element): 51 | element_location = element.location_once_scrolled_into_view 52 | element_height = element.size['height'] 53 | scroll_y = element_location['y'] + element_height 54 | driver.execute_script(""" 55 | window.scrollTo({ 56 | top: arguments[0], 57 | behavior: 'smooth' 58 | }); 59 | """, scroll_y) 60 | 61 | 62 | def save_html(driver, file_name='index.html'): 63 | html = driver.page_source 64 | with open(file_name, 'w', encoding='utf-8') as file: 65 | file.write(html) 66 | 67 | 68 | def convert_posts_to_json(posts, file_name='output.json'): 69 | # print('Saving into json...') 70 | output = [] 71 | for post in posts: 72 | output.append({ 73 | 'id': post.id, 74 | 'likenum': post.likenum, 75 | 'badge': post.badge, 76 | 'content': post.content, 77 | 'time': str(post.time), 78 | 'quote': post.quote, 79 | 'replies': [ 80 | { 81 | 'id': reply.id, 82 | 'name': reply.name, 83 | 'content': reply.content, 84 | 'time': str(reply.time), 85 | 'quote': reply.quote 86 | } 87 | for reply in post.replies 88 | ], 89 | 'tip': post.tip 90 | }) 91 | current_dir = os.path.dirname(os.path.abspath(__file__)) 92 | if not os.path.exists(os.path.join(current_dir, 'data')): 93 | os.makedirs(os.path.join(current_dir, 'data')) 94 | file_path = os.path.join(current_dir, 'data', file_name) 95 | json.dump(output, open(file_path, 'w', encoding='utf-8'), 96 | ensure_ascii=False, indent=2) 97 | 98 | 99 | def get_image(box_content, image_name): 100 | img_element = box_content.find_element( 101 | By.XPATH, ".//p[@class='img']/a/img[starts-with(@src, 'blob:')]") 102 | result = driver.execute_async_script(""" 103 | var img = arguments[0]; 104 | var callback = arguments[1]; 105 | var xhr = new XMLHttpRequest(); 106 | xhr.open('GET', img.src, true); 107 | xhr.responseType = 'blob'; 108 | xhr.onload = function(e) { 109 | if (this.status == 200) { 110 | var reader = new FileReader(); 111 | reader.onloadend = function() { 112 | callback(reader.result); 113 | } 114 | reader.readAsDataURL(this.response); 115 | } else { 116 | callback(null); 117 | } 118 | }; 119 | xhr.send(); 120 | """, img_element) 121 | if result and 'data:image' in result: 122 | current_dir = os.path.dirname(os.path.abspath(__file__)) 123 | if not os.path.exists(os.path.join(current_dir, 'data', 'download')): 124 | os.makedirs(os.path.join(current_dir, 'data', 'download')) 125 | image_path = os.path.join(current_dir, 'data', 'download', image_name) 126 | image_data = result.split(',')[1] 127 | with open(image_path, 'wb') as f: 128 | f.write(base64.b64decode(image_data)) 129 | # print("download a image") 130 | else: 131 | pass 132 | # print("cannot download") 133 | 134 | 135 | def extract_post(post_tree, crawled_pids): 136 | try: 137 | pid = post_tree.find_element( 138 | By.XPATH, ".//div[@class='flow-item']//code[@class='box-id --box-id-copy-content']").get_attribute('textContent').strip() 139 | 140 | if pid in crawled_pids: 141 | return None 142 | else: 143 | crawled_pids.add(pid) 144 | try: 145 | pquote = post_tree.find_element( 146 | By.XPATH, ".//div[@class='flow-item 
flow-item-quote']/div[@class='box']/div[@class='box-header']//code[@class='box-id --box-id-copy-content']").get_attribute('textContent').strip() 147 | except: 148 | pquote = None 149 | try: 150 | plikenum = post_tree.find_element( 151 | By.XPATH, ".//div[@class='flow-item']//span[@class='box-header-badge likenum']").get_attribute('textContent').strip() 152 | except: 153 | plikenum = 0 154 | try: 155 | pbadge = post_tree.find_element( 156 | By.XPATH, ".//div[@class='flow-item']//span[@class='box-header-badge']").get_attribute('textContent').strip() 157 | except: 158 | pbadge = 0 159 | pcontent_fold_body = post_tree.find_element( 160 | By.XPATH, ".//div[@class='flow-item']//div[@class='box-content']//div[@class='content-fold-body']") 161 | pcontent = pcontent_fold_body.get_attribute('textContent') 162 | try: 163 | get_image(pcontent_fold_body, f'image_{pid}.png') 164 | pimage = True 165 | except: 166 | pimage = False 167 | ptime = post_tree.find_element( 168 | By.XPATH, ".//div[@class='flow-item']//div[@class='box-header']").get_attribute('textContent').strip() 169 | ptime = re.search(r'\d{2}-\d{2}\s\d{2}:\d{2}', ptime).group() 170 | 171 | new_post = Post(pid, plikenum, pbadge, pcontent, ptime, pquote, pimage) 172 | except Exception as e: 173 | html_code = driver.execute_script( 174 | "return arguments[0].outerHTML;", post_tree) 175 | logger.error(f'cannot extract post: {e} \n html code:\n{html_code}') 176 | # print(e) 177 | return None 178 | 179 | post = post_tree.find_element(By.XPATH, ".//div[@class='flow-item']") 180 | post = post.find_element(By.XPATH, "..") 181 | try: 182 | pbox_tip = post.find_element( 183 | By.XPATH, ".//div[@class=box box-tip]").text 184 | new_post.tip = pbox_tip 185 | except: 186 | new_post.tip = None 187 | for reply_tree in post.find_elements(By.XPATH, ".//div[@class='flow-reply box dialog-hole-reply']"): 188 | rid = reply_tree.find_element( 189 | By.XPATH, ".//code[@class='box-id']").get_attribute('textContent').strip() 190 | 191 | rtime_ = reply_tree.find_element( 192 | By.XPATH, "./div[@class='box-header']") 193 | rtime = rtime_.get_attribute('textContent').strip() 194 | if not rtime: 195 | html_code = driver.execute_script( 196 | "return arguments[0].outerHTML;", rtime_) 197 | print(html_code) 198 | rtime = re.search(r'\d{2}-\d{2}\s\d{2}:\d{2}', rtime).group() 199 | # rtime = datetime.datetime.strptime(rtime,'%Y-%m-%dT%H:%M:%S') 200 | rbox = reply_tree.find_element(By.XPATH, "./div[@class='box-content']") 201 | try: 202 | rquote = rbox.find_element( 203 | By.XPATH, "./div[contains(@class, 'quote')]").get_attribute('textContent').strip() 204 | except: 205 | rquote = None 206 | rcontents = rbox.find_elements(By.XPATH, "./span") 207 | name = rcontents[1].get_attribute('textContent') 208 | if rquote: 209 | quote_name = rcontents[-2].get_attribute('textContent') 210 | else: 211 | quote_name = None 212 | rcontent = rcontents[-1].get_attribute('textContent')[2:] 213 | if rquote: 214 | new_post.add_reply(rid, name, rcontent, rtime, 215 | (quote_name, rquote)) 216 | else: 217 | new_post.add_reply(rid, name, rcontent, rtime, None) 218 | 219 | return new_post 220 | 221 | 222 | def get_posts(driver, crawled_pids): 223 | posts = [] 224 | post_trees = driver.find_elements( 225 | By.XPATH, "//div[@class='flow-chunk']/div") 226 | ''' 227 | driver.execute_script(""" 228 | arguments[0].scrollIntoView({ 229 | behavior: 'smooth', 230 | block: 'start' 231 | }); 232 | """, post_trees[-1]) 233 | ''' 234 | for post_tree in post_trees: 235 | # print('a new post') 236 | new_post = 
extract_post(post_tree, crawled_pids) 237 | if new_post != None: 238 | posts.append(new_post) 239 | scroll_element(driver, post_trees[-1]) 240 | for i in range(len(post_trees) - 3): 241 | post_tree = post_trees[i] 242 | try: 243 | driver.execute_script(""" 244 | arguments[0].parentNode.removeChild(arguments[0]); 245 | """, post_tree) 246 | except: 247 | logger.warning('cannot delete node') 248 | 249 | return posts 250 | 251 | 252 | if __name__ == '__main__': 253 | webconfig = WebConfig() 254 | logger = logging.getLogger() 255 | logger.setLevel(logging.INFO) 256 | log_name = f'{datetime.datetime.now(datetime.UTC).strftime("UTC%Y-%m-%d %H%M%S")}.log' 257 | file_handler = logging.FileHandler(log_name, encoding='utf-8') 258 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 259 | file_handler.setFormatter(formatter) 260 | logger.addHandler(file_handler) 261 | mode = webconfig.mode 262 | browser = webconfig.browser 263 | profiles_path = webconfig.profiles_path 264 | crawl_size = webconfig.crawl_size 265 | part = webconfig.part 266 | 267 | logger.info( 268 | f'browser={browser}, profiles_path={profiles_path}, crawl_size={crawl_size}, part={part}') 269 | 270 | if browser == 'Firefox': 271 | options = FirefoxOptions() 272 | firefox_profile = FirefoxProfile(profiles_path) 273 | options.profile = firefox_profile 274 | # options.add_argument("--headless") 275 | driver = webdriver.Firefox(options=options) 276 | elif browser == 'Edge': 277 | options = EdgeOptions() 278 | options.add_argument(rf'user-data-dir={profiles_path}') 279 | # options.add_argument("--headless=new") 280 | driver = webdriver.Edge(options=options) 281 | driver.execute_script(""" 282 | Object.defineProperty(Navigator, 'webdriver', {get: () => undefined}); 283 | Object.defineProperty(navigator, 'webdriver', {get: () => false}); 284 | """) 285 | driver.get('https://treehole.pku.edu.cn') 286 | time.sleep(1) 287 | current_url = driver.current_url 288 | if current_url == 'https://treehole.pku.edu.cn/web/verification': 289 | content = driver.find_element(By.TAG_NAME, 'body').text 290 | if '短信' in content: 291 | logger.error('need message verification') 292 | elif '手机令牌' in content: 293 | logger.error('need mobile token') 294 | else: 295 | logger.error('unknown error') 296 | print('Fail to log') 297 | driver.close() 298 | elif current_url.startswith('https://treehole.pku.edu.cn'): 299 | logger.info('Log in successfully') 300 | print('Log in successfully') 301 | time.sleep(5) 302 | # save_html(driver) 303 | 304 | posts = [] 305 | crawled_pids = set([]) 306 | i = 1 307 | total_length = 0 308 | memory_warn = False 309 | timeout_warn = False 310 | while (total_length < crawl_size): 311 | memory = psutil.virtual_memory() 312 | if not memory_warn and memory.available <= 524288000: 313 | memory_warn = True 314 | logger.warning( 315 | f'lack of available memory: {memory.available / (1024**2):.2f} MB') 316 | elif memory.available <= 262144000: 317 | logger.error( 318 | f'serious lack of available memory: {memory.available / (1024**2):.2f} MB') 319 | break 320 | new_posts = get_posts(driver, crawled_pids) 321 | if not timeout_warn: 322 | if len(new_posts) == 0: 323 | t_start = time.time() 324 | timeout_warn = True 325 | else: 326 | if len(new_posts) == 0: 327 | t_end = time.time() 328 | if t_end - t_start >= 20: 329 | logger.error(f'time out: {(t_end - t_start):.2f}s') 330 | break 331 | else: 332 | timeout_warn = False 333 | 334 | posts += new_posts 335 | if len(posts) >= part: 336 | start = posts[0].id 337 | end = 
posts[part - 1].id 338 | now = datetime.datetime.now(datetime.UTC) 339 | now = now.strftime("UTC%Y-%m-%d %H%M%S") 340 | json_name = f'tree_hole_{part}_{start}-{end}({now}).json' 341 | convert_posts_to_json( 342 | posts[:part], file_name=json_name) 343 | total_length += part 344 | logger.info(f'part {i} done: {total_length}/{crawl_size}') 345 | i += 1 346 | posts = posts[part:] 347 | print_progress((len(posts), total_length), (part, crawl_size), 348 | (f'第{i}部分:', '总进度:'), (' ', ''), (1, 1), (10, 10)) 349 | time.sleep(0.1) 350 | logger.info('crawling done') 351 | print('Crawling done') 352 | input("Press any key to finish...") 353 | driver.close() 354 | 355 | else: 356 | logger.warning('fail to log, try again') 357 | print('Fail to log') 358 | # input('Press any key to quit...') 359 | driver.close() --------------------------------------------------------------------------------
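For reference, here is a minimal sketch of loading one of the batch files that `convert_posts_to_json()` in run.py writes to `v0/data/`; the file name below is a placeholder for whatever batch `run.py` actually produced.

```python
import json

# Placeholder name -- substitute an actual tree_hole_*.json file from v0/data/
with open("data/tree_hole_200_4000000-4000199(UTC2024-01-01 000000).json", encoding="utf-8") as f:
    posts = json.load(f)

# Each entry mirrors convert_posts_to_json(): id, likenum, badge, content, time,
# quote, tip, and replies (a list of {id, name, content, time, quote} dicts).
for post in posts:
    print(post["id"], post["likenum"], len(post["replies"]))
```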