标签（代表每个帖子） 124 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 125 | # 检测数据变化的循环 126 | previous_last_index = last_index 127 | # 如果已经爬取了数据则需要对页面进行滚动加载操作 128 | # 由于数据不是一条条加载而是一次加载数个的所以要不断尝试 129 | if last_index != 0: 130 | logger.info(f"开始滚动页面") 131 | ActionChains(driver).scroll_by_amount(0, 600).perform() 132 | time.sleep(3) 133 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 134 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 135 | try: 136 | # 获取第一个和最后一个

标签的数据索引 137 | first_section = sections[0].get_attribute("data-index") 138 | last_section = sections[-1].get_attribute("data-index") 139 | except StaleElementReferenceException: 140 | logger.warning("检测到 StaleElementReferenceException，重新获取 sections") 141 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 142 | first_section = sections[0].get_attribute("data-index") 143 | last_section = sections[-1].get_attribute("data-index") 144 | 145 | while int(first_section) > int(last_index): 146 | logger.warning(f"滑动超过预期,回滑") 147 | ActionChains(driver).scroll_by_amount(0, -200).perform() 148 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 149 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 150 | first_section = sections[0].get_attribute("data-index") 151 | last_section = sections[-1].get_attribute("data-index") 152 | 153 | logger.info(f"回滑完成,当前数据开始索引为{first_section},结束索引为{last_section}") 154 | last_index = get_container(keywords,total,sections,last_index) 155 | else: 156 | last_index = get_container(keywords,total,sections,last_index) 157 | 158 | 159 | # 检查是否数据变化 160 | if last_index == previous_last_index: 161 | no_change_count += 1 162 | if no_change_count >= no_change_limit: 163 | logger.info("数据已停止变化，结束爬取") 164 | break 165 | else: 166 | no_change_count = 0 # 重置计数器 167 | 168 | 169 | def get_container(keywords,total,sections,last_index): 170 | file = f"{keywords}.xlsx" 171 | # 获取内容页面 172 | FEEDS_CONTAINER_CLASS = 'feeds-container' 173 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 174 | time.sleep(2) 175 | logger.info(f"获取页面中卡片完成") 176 | 177 | # 动态获取刷新以对抗爬虫过程中页面元素更新 178 | for i in range(len(sections)): 179 | if last_index == total: 180 | logger.info(f"所有帖子爬取完毕") 181 | break 182 | try: 183 | logger.info(f"读取第{last_index}个帖子") 184 | # 重新定位当前 section，以防止 stale element 问题 185 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 186 | # 判断是否包含 data-width 和 data-height 属性，对抗广告 187 | data_index = sections[i].get_attribute("data-index") 188 | data_width = sections[i].get_attribute("data-width") 189 | data_height = sections[i].get_attribute("data-height") 190 | if int(data_index) < int(last_index): 191 | logger.info(f"本批次, 第 {i} 个帖子爬取了，跳过") 192 | continue 193 | if not data_width or not data_height: 194 | logger.info(f"第 {last_index} 个帖子为广告，跳过") 195 | last_index += 1 196 | tree = None 197 | else: 198 | logger.info(f"点击第{last_index}个帖子") 199 | try: 200 | sections[i].click() 201 | except ElementClickInterceptedException: 202 | logger.error(f"第 {last_index} 个帖子点击失败,可能是不兼容的直播或广告,跳过") 203 | tree = None 204 | last_index += 1 205 | get_content(tree,file) # 获取页面数据 206 | continue 207 | # 抓取帖子页面内容 208 | AUTHOR_AREA_CLASS = 'author-wrapper' 209 | TITLE_AREA_ID = 'detail-title' 210 | CONTENT_AREA_ID = 'detail-desc' 211 | BOTTOM_AREA_CLASS = 'bottom-container' 212 | INTERATE_AREA_CLASS = 'interact-container' 213 | logger.info("加载数据中") 214 | wait = WebDriverWait(driver, 10) 215 | # 等待页面的各个部分加载完成 216 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, AUTHOR_AREA_CLASS))) 217 | logger.info("加载作者信息完成") 218 | try: 219 | wait.until(EC.presence_of_element_located((By.ID, TITLE_AREA_ID))) 220 | logger.info("加载标题完成") 221 | wait.until(EC.presence_of_element_located((By.ID, CONTENT_AREA_ID))) 222 | logger.info("加载内容完成") 223 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, BOTTOM_AREA_CLASS))) 224 | logger.info("加载底部信息完成") 225 | except TimeoutException: 226 | logger.warning("出现内容为空,此消息作为警告消息提示") 227 | except Exception as e: 228 | logger.error(f"加载内容失败，错误信息：{e}") 229 | 230 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, INTERATE_AREA_CLASS))) 231 | logger.info("加载具体浏览数据完成") 232 | # 截取页面数据 233 | logger.info("截取当前页面数据中...") 234 | page_content = driver.page_source 235 | tree = etree.HTML(page_content) 236 | 237 | time.sleep(2) 238 | # 发送 ESC 键关闭帖子 239 | logger.info("点击关闭按钮关闭帖子") 240 | CLOSS_BTN_CLASS = 'close-circle' 241 | close_btn = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, CLOSS_BTN_CLASS))) 242 | close_btn.click() 243 | logger.info("已关闭帖子") 244 | 245 | # 更新读取进度 246 | logger.info(f"读取第{last_index}个帖子完成") 247 | last_index += 1 248 | # 等待关闭后重新定位页面 249 | logger.info("重新定位页面中") 250 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 251 | logger.info("重新定位页面完成") 252 | time.sleep(3) 253 | 254 | get_content(tree,file) # 获取页面数据 255 | except StaleElementReferenceException: 256 | logger.error(f"StaleElementReferenceException: 第{last_index}个帖子元素失效，重新尝试获取元素") 257 | # 重新获取页面的帖子列表并重试 258 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 259 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 260 | except IndexError: 261 | logger.warning(f"已经爬取到了所有帖子无法再爬取更多了，总数为{last_index}个") 262 | logger.info(f"爬取完成,爬取到{last_index}个帖子") 263 | return last_index 264 | 265 | def get_content(tree,file): 266 | data = {} 267 | if not tree: 268 | data["remark"] = "这是一个广告" 269 | write_to_excel(data,file) 270 | return 271 | 272 | # 获取用户名 273 | username_element = tree.xpath('//div[@class="author-wrapper"]//span[@class="username"]/text()') 274 | if username_element: 275 | data["username"] = username_element[0] 276 | 277 | # 获取标题 278 | title_element = tree.xpath('//div[@id="detail-title"]/text()') 279 | if title_element: 280 | data["title"] = title_element[0] 281 | 282 | # 获取内容 283 | content_element = tree.xpath('//div[@id="detail-desc"]//span[@class="note-text"]/span/text()') 284 | if content_element: 285 | data["content"] = content_element[0] 286 | 287 | # 获取标签 288 | tags_elements = tree.xpath('//a[@class="tag"]/text()') 289 | tags = [tag.strip() for tag in tags_elements] 290 | data["tags"] = tags 291 | 292 | # 获取发布时间和地点 293 | date_local_element = tree.xpath('//div[@class="bottom-container"]//span[@class="date"]/text()') 294 | if date_local_element: 295 | data["date_local"] = date_local_element[0] 296 | 297 | # 获取点赞数 298 | like_count_element = tree.xpath('//div[@class="interact-container"]/div/div//span[contains(@class, "like-wrapper")]//span[contains(@class, "count")]/text()') 299 | if like_count_element: 300 | data["like_count"] = like_count_element[0] 301 | if data["like_count"] == "点赞": 302 | data["like_count"] = 0 303 | 304 | 305 | # 获取收藏数 306 | collect_count_element = tree.xpath('//span[contains(@class, "collect-wrapper")]//span[contains(@class, "count")]/text()') 307 | if collect_count_element: 308 | data["collect_count"] = collect_count_element[0] 309 | if data["collect_count"] == "收藏": 310 | data["collect_count"] = 0 311 | 312 | # 获取评论数 313 | comment_count_element = tree.xpath('//span[contains(@class, "chat-wrapper")]//span[contains(@class, "count")]/text()') 314 | if comment_count_element: 315 | data["comment_count"] = comment_count_element[0] 316 | if data["comment_count"] == "评论": 317 | data["comment_count"] = 0 318 | 319 | # 将数据写入 Excel 320 | write_to_excel(data,file) 321 | 322 | def write_to_excel(data, filename="output.xlsx"): 323 | try: 324 | # 尝试打开已有文件，否则创建新文件 325 | wb = openpyxl.load_workbook(filename) 326 | ws = wb.active 327 | except FileNotFoundError: 328 | wb = Workbook() 329 | ws = wb.active 330 | # 写入表头 331 | headers = ["用户名", "标题", "内容", "标签", "发布时间和地点", "点赞数", "收藏数", "评论数", "备注"] 332 | ws.append(headers) 333 | # 设置表头样式 334 | for col_num, header in enumerate(headers, 1): 335 | cell = ws.cell(row=1, column=col_num) 336 | cell.font = Font(bold=True, color="FFFFFF") 337 | cell.fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid") 338 | cell.alignment = Alignment(horizontal="center", vertical="center") 339 | ws.column_dimensions[get_column_letter(col_num)].width = 15 # 设置列宽 340 | 341 | 342 | # 写入数据到表格 343 | ws.append([ 344 | data.get("username", ""), 345 | data.get("title", ""), 346 | data.get("content", ""), 347 | ", ".join(data.get("tags", [])), 348 | data.get("date_local", ""), 349 | data.get("like_count", ""), 350 | data.get("collect_count", ""), 351 | data.get("comment_count", ""), 352 | data.get("remark", "") 353 | ]) 354 | 355 | # 保存文件 356 | wb.save(filename) 357 | logger.info(f"数据已写入 {filename}") 358 | 359 | if __name__ == '__main__': 360 | 361 | # 定义需要爬取的关键词 362 | keywords = "运动" 363 | total = 229 # 设置爬取的最大条数(如果设置的条数大于能爬到的最大条数则以能爬取的最大条数为准,一般小红书的最新页面展示的是229个) 364 | # 爬取关键词 365 | search_page(keywords,total) --------------------------------------------------------------------------------