├── requirements.txt ├── README.md ├── LOGGER.py ├── login.py └── main.py /requirements.txt: -------------------------------------------------------------------------------- 1 | openpyxl 2 | selenium 3 | requests 4 | rich 5 | lxml 6 | pillow 7 | 8 | 9 | 10 | -i https://pypi.tuna.tsinghua.edu.cn/simple -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 小红书关键词帖子文章检索和收集爬虫程序 2 | 3 | 4 | ## 程序介绍 5 | 本程序使用`selenium`自动化测试框架结合`ChromeDriver`浏览器驱动,模拟用户登录小红书网站,根据关键词搜索帖子,并收集帖子中的文章链接,保存到本地文件中。 6 | 主要用于自动化数据抓取、解析和管理,通过模块化的日志系统跟踪项目的执行流程。 7 | 此项目采用了多个流行的Python库,使得项目在数据抓取和自动化任务中表现出色,适用于爬虫、数据处理和自动化任务场景。 8 | 9 | ## 项目优点 10 | - 高效自动化:通过selenium和requests实现自动化的网页登录、数据抓取及解析,大大提升了操作的效率。 11 | - 模块化设计:各功能模块独立,如LOGGER.py管理日志、login.py负责登录,模块之间松耦合,便于后续维护和扩展。 12 | - 易于配置:依赖文件requirements.txt包含了所有需要的库,且支持国内源安装,用户只需一条命令即可完成环境搭建。 13 | - 本地化日志管理:LOGGER.py模块提供丰富的日志记录,便于在不同环境中调试和跟踪项目运行状态。 14 | 15 | ## 程序功能 16 | 1. 模拟用户登录小红书网站 17 | 2. 根据关键词搜索帖子 18 | 3. 收集帖子中的文章数据 19 | 4. 保存文章信息到本地文件中 20 | 21 | ## 程序模块 22 | ``` 23 | ├── LOGGER.py # 日志模块 24 | ├── login.py # 登录相关模块 25 | ├── main.py # 主程序入口 26 | ├── README.md # 项目文档 27 | ├── requirements.txt # 依赖库 28 | ``` 29 | 30 | ## 环境安装 31 | 32 | Step1: 安装谷歌浏览器并下载ChromeDriver 33 | 34 | ``` 35 | https://googlechromelabs.github.io/chrome-for-testing/ 36 | ``` 37 | 38 | Step2: 安装依赖库 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | Tips 推荐使用Python3.10版本 44 | 45 | ## 使用方法 46 | 47 | 1. 首先需要运行`login.py`文件进行扫码登录 48 | - 请注意,登录后需要手动扫码确认登录,登录成功后程序会自动退出。 49 | - 登录成功后,程序会成功用于保存登录状态。 50 | 51 | ```bash 52 | python login.py 53 | ``` 54 | 55 | 56 | 2. 然后运行`main.py`文件进行关键词帖子文章检索和收集 57 | 58 | ```bash 59 | python main.py 60 | ``` 61 | 62 | # 提示:软件运行过程中实时向目标Excel中写入数据,请不要打开Excel防止无法写入 63 | -------------------------------------------------------------------------------- /LOGGER.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import logging 5 | from rich.console import Console 6 | from rich.logging import RichHandler 7 | from rich.theme import Theme 8 | 9 | custom_theme = Theme({ 10 | "log.time": "dim blue", 11 | "logging.level.debug": "cyan", 12 | "logging.level.info": "green", 13 | "logging.level.warning": "yellow", 14 | "logging.level.error": "bold red", 15 | "logging.level.critical": "reverse bold red", 16 | }) 17 | 18 | def configure_logger(logger_name: str = "default", debug: bool = False, log_file: str = None): 19 | console = Console(theme=custom_theme) 20 | rich_handler = RichHandler(console=console, show_time=True, show_path=False) 21 | formatter = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s") 22 | rich_handler.setFormatter(formatter) 23 | handlers = [rich_handler] 24 | if log_file: 25 | file_handler = logging.FileHandler(log_file, mode='a', encoding='utf-8') 26 | file_handler.setFormatter(formatter) 27 | handlers.append(file_handler) 28 | 29 | if debug: 30 | logger_level = logging.DEBUG 31 | else: 32 | logger_level = logging.INFO 33 | 34 | logging.basicConfig( 35 | level=logger_level, 36 | handlers=handlers 37 | ) 38 | 39 | return logging.getLogger(logger_name) 40 | 41 | # Non strict singleton mode returns this logger 42 | class GetLogger: 43 | _instance = {} 44 | 45 | def __new__(cls, logger_name="rich", debug=True, log_file=None): 46 | if logger_name not in cls._instance: 47 | cls._instance[logger_name] = configure_logger(logger_name, debug, log_file) 48 | return cls._instance[logger_name] 49 | 50 | def __init__(self): 51 | pass 52 | -------------------------------------------------------------------------------- /login.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import stat 6 | import time 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.chrome.service import Service 13 | from selenium.common.exceptions import TimeoutException 14 | 15 | from LOGGER import GetLogger 16 | 17 | logger = GetLogger(logger_name="XiaoHongShu", debug=False, log_file="XiaoHongShu.log") 18 | 19 | USERPATH = os.path.abspath("./userData") 20 | 21 | if not os.path.exists(USERPATH): 22 | os.mkdir(USERPATH) 23 | # 赋予所有用户完全访问权限 24 | os.chmod(USERPATH, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) # 设置权限为 77 25 | 26 | chrome_options = Options() 27 | 28 | # 使用用户数据目录 29 | chrome_options.add_argument(f"--user-data-dir={USERPATH}") 30 | chrome_options.add_argument("--window-size=1280,720") 31 | service = Service('./chromedriver.exe') 32 | 33 | driver = webdriver.Chrome(options=chrome_options, service=service) 34 | 35 | # 定义常量 36 | LOGIN_CONTAINER_CLASS = 'login-container' 37 | LOGIN_BTN_CLASS = 'login-btn' 38 | USER_BTN_CLASS = 'user' 39 | XIAOHONGSHU_URL = 'https://www.xiaohongshu.com/explore' 40 | 41 | # 登录函数 42 | def login(): 43 | driver.get(XIAOHONGSHU_URL) 44 | try: 45 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 46 | except TimeoutException: 47 | try: 48 | login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, LOGIN_BTN_CLASS))) 49 | login_button.click() 50 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 51 | except: 52 | logger.error("登录失败") 53 | return False 54 | logger.info("请使用手机扫码登录,请不要关闭登录页面否则会导致程序报错") 55 | 56 | while True: 57 | try: 58 | WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 59 | if driver.find_elements(By.CLASS_NAME, LOGIN_BTN_CLASS): 60 | logger.warning("不要关闭登录页面,请正常扫码登录!") 61 | driver.find_element(By.CLASS_NAME, LOGIN_BTN_CLASS).click() 62 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 63 | elif driver.find_elements(By.CLASS_NAME, USER_BTN_CLASS): 64 | logger.info("登录成功!") 65 | return True 66 | else: 67 | logger.error("登录失败,发生了未知的错误!") 68 | return False 69 | except Exception as e: 70 | logger.info("页面已经刷新...5秒后重启登录") 71 | logger.info("如果您发现页面的二维码已经刷新请手动点击刷新") 72 | time.sleep(5) 73 | 74 | if __name__ == '__main__': 75 | if login(): 76 | logger.info("登录成功,这下可以直接运行App.py了") 77 | else: 78 | logger.error("登录失败,请检查您的网络连接和登录信息") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | 6 | from openpyxl import Workbook 7 | import openpyxl 8 | from openpyxl.utils import get_column_letter 9 | from openpyxl.styles import Font, PatternFill, Alignment 10 | 11 | import time 12 | from lxml import etree 13 | 14 | from selenium import webdriver 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.ui import WebDriverWait 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from selenium.webdriver.common.keys import Keys 19 | from selenium.webdriver.chrome.options import Options 20 | from selenium.webdriver.common.action_chains import ActionChains 21 | from selenium.webdriver.chrome.service import Service 22 | from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,ElementClickInterceptedException 23 | 24 | from LOGGER import GetLogger 25 | 26 | logger = GetLogger(logger_name="XiaoHongShu", debug=False, log_file="XiaoHongShu.log") 27 | 28 | # 设置 Chrome 选项以避免打印出过多日志 29 | chrome_options = Options() 30 | USERPATH = os.path.abspath("./userData") 31 | service = Service('./chromedriver.exe') 32 | chrome_options.add_argument(f"--user-data-dir={USERPATH}") 33 | chrome_options.add_argument("--headless=new") # 无头模式 34 | chrome_options.add_argument("--disable-gpu") 35 | chrome_options.add_argument("--window-size=1920,1080") 36 | # chrome_options.add_argument("--window-size=1280,720") 37 | 38 | 39 | chrome_options.add_argument("--disable-logging") 40 | chrome_options.add_argument("--log-level=3") 41 | # 浏览器伪装:模拟真实的用户在操作.防反爬 42 | chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 43 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 44 | chrome_options.add_experimental_option("useAutomationExtension", False) 45 | # 初始化 WebDriver 46 | driver = webdriver.Chrome(options=chrome_options,service=service) 47 | 48 | LOGIN_CONTAINER_CLASS = 'login-container' 49 | LOGIN_BTN_CLASS = 'login-btn' 50 | USER_BTN_CLASS = 'user' 51 | 52 | XIAOHONGSHU_URL = 'https://www.xiaohongshu.com/explore' 53 | 54 | def search_page(keyword,total): 55 | logger.info("打开小红书页面中") 56 | driver.get(XIAOHONGSHU_URL) 57 | SEARCH_INPUT_ID = 'search-input' 58 | SEARCH_ICON_CLASS = 'search-icon' 59 | # 向搜索框发送关键字 60 | logger.info(f"搜索关键字[{keyword}]中") 61 | search_input_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, SEARCH_INPUT_ID))) 62 | search_input_object.send_keys(keyword) 63 | logger.info(f"输入关键字[{keyword}]完成") 64 | # 点击搜索按钮 65 | logger.info(f"点击搜索按钮中") 66 | search_input_button_object = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, SEARCH_ICON_CLASS))) 67 | search_input_button_object.click() 68 | logger.info(f"点击搜索按钮完成") 69 | time.sleep(2) 70 | # 点击筛选按钮 71 | FILTER_INCO_CLASS = 'filter' 72 | filter_area_object = WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.CLASS_NAME, FILTER_INCO_CLASS))) 73 | # 悬浮至该按钮上 74 | ActionChains(driver).move_to_element(filter_area_object).perform() 75 | time.sleep(1) # 等待下拉菜单显示 76 | # 获取所有具有类名 'dropdown-items' 的元素 77 | dropdown_menus = driver.find_elements(By.CLASS_NAME, "dropdown-items") 78 | 79 | # 过滤出 display 样式为 inline-block 的元素(就是正常显示的那个) 80 | visible_menus = [ 81 | menu for menu in dropdown_menus 82 | if driver.execute_script("return window.getComputedStyle(arguments[0]).display;", menu) == 'inline-block' 83 | ] 84 | # 输出符合条件的元素数量及内容 85 | logger.info(f"找到 {len(visible_menus)} 个 display 为 inline-block 的 'dropdown-items' 元素") 86 | 87 | if len(visible_menus) == 0: 88 | logger.error("未找到任何可见的菜单") 89 | return 90 | 91 | # 获取第一个可见的菜单 92 | dropdown_menu = visible_menus[0] 93 | # 获取第一个可见的菜单中的所有
  • 元素 94 | li_elements = dropdown_menu.find_elements(By.TAG_NAME, "li") 95 | # 遍历每个
  • ,查找其中包含文本 "最热" 的 96 | for li in li_elements: 97 | try: 98 | # 查找
  • 内的 标签 99 | span = li.find_element(By.TAG_NAME, "span") 100 | # 检查 的文本是否为 "最热" 101 | if span.text == "最热": 102 | # 点击找到的 "最热" 选项 103 | li.click() 104 | logger.info("成功点击 '最热' 选项") 105 | break 106 | except Exception as e: 107 | logger.warning(f"在
  • 元素中查找时出现问题: {e}") 108 | else: 109 | logger.error("未找到包含 '最热' 文本的
  • 标签") 110 | 111 | 112 | last_index = 0 # 上一次最后爬取的数据索引 113 | no_change_count = 0 # 记录没有变化的次数 114 | no_change_limit = 3 # 设定限制次数 115 | while last_index <= int(total - 1): 116 | if last_index == 0: 117 | logger.info("开始爬取数据中") 118 | else: 119 | logger.info(f"该翻页了! 起始数据{last_index + 1}条: 获取页面数据中") 120 | # 获取内容页面 121 | FEEDS_CONTAINER_CLASS = 'feeds-container' 122 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 123 | # 获取容器内的所有
    标签(代表每个帖子) 124 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 125 | # 检测数据变化的循环 126 | previous_last_index = last_index 127 | # 如果已经爬取了数据则需要对页面进行滚动加载操作 128 | # 由于数据不是一条条加载而是一次加载数个的所以要不断尝试 129 | if last_index != 0: 130 | logger.info(f"开始滚动页面") 131 | ActionChains(driver).scroll_by_amount(0, 600).perform() 132 | time.sleep(3) 133 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 134 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 135 | try: 136 | # 获取第一个和最后一个
    标签的数据索引 137 | first_section = sections[0].get_attribute("data-index") 138 | last_section = sections[-1].get_attribute("data-index") 139 | except StaleElementReferenceException: 140 | logger.warning("检测到 StaleElementReferenceException,重新获取 sections") 141 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 142 | first_section = sections[0].get_attribute("data-index") 143 | last_section = sections[-1].get_attribute("data-index") 144 | 145 | while int(first_section) > int(last_index): 146 | logger.warning(f"滑动超过预期,回滑") 147 | ActionChains(driver).scroll_by_amount(0, -200).perform() 148 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 149 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 150 | first_section = sections[0].get_attribute("data-index") 151 | last_section = sections[-1].get_attribute("data-index") 152 | 153 | logger.info(f"回滑完成,当前数据开始索引为{first_section},结束索引为{last_section}") 154 | last_index = get_container(keywords,total,sections,last_index) 155 | else: 156 | last_index = get_container(keywords,total,sections,last_index) 157 | 158 | 159 | # 检查是否数据变化 160 | if last_index == previous_last_index: 161 | no_change_count += 1 162 | if no_change_count >= no_change_limit: 163 | logger.info("数据已停止变化,结束爬取") 164 | break 165 | else: 166 | no_change_count = 0 # 重置计数器 167 | 168 | 169 | def get_container(keywords,total,sections,last_index): 170 | file = f"{keywords}.xlsx" 171 | # 获取内容页面 172 | FEEDS_CONTAINER_CLASS = 'feeds-container' 173 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 174 | time.sleep(2) 175 | logger.info(f"获取页面中卡片完成") 176 | 177 | # 动态获取刷新以对抗爬虫过程中页面元素更新 178 | for i in range(len(sections)): 179 | if last_index == total: 180 | logger.info(f"所有帖子爬取完毕") 181 | break 182 | try: 183 | logger.info(f"读取第{last_index}个帖子") 184 | # 重新定位当前 section,以防止 stale element 问题 185 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 186 | # 判断是否包含 data-width 和 data-height 属性,对抗广告 187 | data_index = sections[i].get_attribute("data-index") 188 | data_width = sections[i].get_attribute("data-width") 189 | data_height = sections[i].get_attribute("data-height") 190 | if int(data_index) < int(last_index): 191 | logger.info(f"本批次, 第 {i} 个帖子爬取了,跳过") 192 | continue 193 | if not data_width or not data_height: 194 | logger.info(f"第 {last_index} 个帖子为广告,跳过") 195 | last_index += 1 196 | tree = None 197 | else: 198 | logger.info(f"点击第{last_index}个帖子") 199 | try: 200 | sections[i].click() 201 | except ElementClickInterceptedException: 202 | logger.error(f"第 {last_index} 个帖子点击失败,可能是不兼容的直播或广告,跳过") 203 | tree = None 204 | last_index += 1 205 | get_content(tree,file) # 获取页面数据 206 | continue 207 | # 抓取帖子页面内容 208 | AUTHOR_AREA_CLASS = 'author-wrapper' 209 | TITLE_AREA_ID = 'detail-title' 210 | CONTENT_AREA_ID = 'detail-desc' 211 | BOTTOM_AREA_CLASS = 'bottom-container' 212 | INTERATE_AREA_CLASS = 'interact-container' 213 | logger.info("加载数据中") 214 | wait = WebDriverWait(driver, 10) 215 | # 等待页面的各个部分加载完成 216 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, AUTHOR_AREA_CLASS))) 217 | logger.info("加载作者信息完成") 218 | try: 219 | wait.until(EC.presence_of_element_located((By.ID, TITLE_AREA_ID))) 220 | logger.info("加载标题完成") 221 | wait.until(EC.presence_of_element_located((By.ID, CONTENT_AREA_ID))) 222 | logger.info("加载内容完成") 223 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, BOTTOM_AREA_CLASS))) 224 | logger.info("加载底部信息完成") 225 | except TimeoutException: 226 | logger.warning("出现内容为空,此消息作为警告消息提示") 227 | except Exception as e: 228 | logger.error(f"加载内容失败,错误信息:{e}") 229 | 230 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, INTERATE_AREA_CLASS))) 231 | logger.info("加载具体浏览数据完成") 232 | # 截取页面数据 233 | logger.info("截取当前页面数据中...") 234 | page_content = driver.page_source 235 | tree = etree.HTML(page_content) 236 | 237 | time.sleep(2) 238 | # 发送 ESC 键关闭帖子 239 | logger.info("点击关闭按钮关闭帖子") 240 | CLOSS_BTN_CLASS = 'close-circle' 241 | close_btn = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, CLOSS_BTN_CLASS))) 242 | close_btn.click() 243 | logger.info("已关闭帖子") 244 | 245 | # 更新读取进度 246 | logger.info(f"读取第{last_index}个帖子完成") 247 | last_index += 1 248 | # 等待关闭后重新定位页面 249 | logger.info("重新定位页面中") 250 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 251 | logger.info("重新定位页面完成") 252 | time.sleep(3) 253 | 254 | get_content(tree,file) # 获取页面数据 255 | except StaleElementReferenceException: 256 | logger.error(f"StaleElementReferenceException: 第{last_index}个帖子元素失效,重新尝试获取元素") 257 | # 重新获取页面的帖子列表并重试 258 | feed_container_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, FEEDS_CONTAINER_CLASS))) 259 | sections = feed_container_object.find_elements(By.TAG_NAME, "section") 260 | except IndexError: 261 | logger.warning(f"已经爬取到了所有帖子无法再爬取更多了,总数为{last_index}个") 262 | logger.info(f"爬取完成,爬取到{last_index}个帖子") 263 | return last_index 264 | 265 | def get_content(tree,file): 266 | data = {} 267 | if not tree: 268 | data["remark"] = "这是一个广告" 269 | write_to_excel(data,file) 270 | return 271 | 272 | # 获取用户名 273 | username_element = tree.xpath('//div[@class="author-wrapper"]//span[@class="username"]/text()') 274 | if username_element: 275 | data["username"] = username_element[0] 276 | 277 | # 获取标题 278 | title_element = tree.xpath('//div[@id="detail-title"]/text()') 279 | if title_element: 280 | data["title"] = title_element[0] 281 | 282 | # 获取内容 283 | content_element = tree.xpath('//div[@id="detail-desc"]//span[@class="note-text"]/span/text()') 284 | if content_element: 285 | data["content"] = content_element[0] 286 | 287 | # 获取标签 288 | tags_elements = tree.xpath('//a[@class="tag"]/text()') 289 | tags = [tag.strip() for tag in tags_elements] 290 | data["tags"] = tags 291 | 292 | # 获取发布时间和地点 293 | date_local_element = tree.xpath('//div[@class="bottom-container"]//span[@class="date"]/text()') 294 | if date_local_element: 295 | data["date_local"] = date_local_element[0] 296 | 297 | # 获取点赞数 298 | like_count_element = tree.xpath('//div[@class="interact-container"]/div/div//span[contains(@class, "like-wrapper")]//span[contains(@class, "count")]/text()') 299 | if like_count_element: 300 | data["like_count"] = like_count_element[0] 301 | if data["like_count"] == "点赞": 302 | data["like_count"] = 0 303 | 304 | 305 | # 获取收藏数 306 | collect_count_element = tree.xpath('//span[contains(@class, "collect-wrapper")]//span[contains(@class, "count")]/text()') 307 | if collect_count_element: 308 | data["collect_count"] = collect_count_element[0] 309 | if data["collect_count"] == "收藏": 310 | data["collect_count"] = 0 311 | 312 | # 获取评论数 313 | comment_count_element = tree.xpath('//span[contains(@class, "chat-wrapper")]//span[contains(@class, "count")]/text()') 314 | if comment_count_element: 315 | data["comment_count"] = comment_count_element[0] 316 | if data["comment_count"] == "评论": 317 | data["comment_count"] = 0 318 | 319 | # 将数据写入 Excel 320 | write_to_excel(data,file) 321 | 322 | def write_to_excel(data, filename="output.xlsx"): 323 | try: 324 | # 尝试打开已有文件,否则创建新文件 325 | wb = openpyxl.load_workbook(filename) 326 | ws = wb.active 327 | except FileNotFoundError: 328 | wb = Workbook() 329 | ws = wb.active 330 | # 写入表头 331 | headers = ["用户名", "标题", "内容", "标签", "发布时间和地点", "点赞数", "收藏数", "评论数", "备注"] 332 | ws.append(headers) 333 | # 设置表头样式 334 | for col_num, header in enumerate(headers, 1): 335 | cell = ws.cell(row=1, column=col_num) 336 | cell.font = Font(bold=True, color="FFFFFF") 337 | cell.fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid") 338 | cell.alignment = Alignment(horizontal="center", vertical="center") 339 | ws.column_dimensions[get_column_letter(col_num)].width = 15 # 设置列宽 340 | 341 | 342 | # 写入数据到表格 343 | ws.append([ 344 | data.get("username", ""), 345 | data.get("title", ""), 346 | data.get("content", ""), 347 | ", ".join(data.get("tags", [])), 348 | data.get("date_local", ""), 349 | data.get("like_count", ""), 350 | data.get("collect_count", ""), 351 | data.get("comment_count", ""), 352 | data.get("remark", "") 353 | ]) 354 | 355 | # 保存文件 356 | wb.save(filename) 357 | logger.info(f"数据已写入 {filename}") 358 | 359 | if __name__ == '__main__': 360 | 361 | # 定义需要爬取的关键词 362 | keywords = "运动" 363 | total = 229 # 设置爬取的最大条数(如果设置的条数大于能爬到的最大条数则以能爬取的最大条数为准,一般小红书的最新页面展示的是229个) 364 | # 爬取关键词 365 | search_page(keywords,total) --------------------------------------------------------------------------------