├── requirements.txt ├── README.md ├── LOGGER.py ├── login.py └── main.py /requirements.txt: -------------------------------------------------------------------------------- 1 | openpyxl 2 | selenium 3 | requests 4 | rich 5 | lxml 6 | pillow 7 | 8 | 9 | 10 | -i https://pypi.tuna.tsinghua.edu.cn/simple -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 小红书关键词帖子文章检索和收集爬虫程序 2 | 3 | 4 | ## 程序介绍 5 | 本程序使用`selenium`自动化测试框架结合`ChromeDriver`浏览器驱动,模拟用户登录小红书网站,根据关键词搜索帖子,并收集帖子中的文章链接,保存到本地文件中。 6 | 主要用于自动化数据抓取、解析和管理,通过模块化的日志系统跟踪项目的执行流程。 7 | 此项目采用了多个流行的Python库,使得项目在数据抓取和自动化任务中表现出色,适用于爬虫、数据处理和自动化任务场景。 8 | 9 | ## 项目优点 10 | - 高效自动化:通过selenium和requests实现自动化的网页登录、数据抓取及解析,大大提升了操作的效率。 11 | - 模块化设计:各功能模块独立,如LOGGER.py管理日志、login.py负责登录,模块之间松耦合,便于后续维护和扩展。 12 | - 易于配置:依赖文件requirements.txt包含了所有需要的库,且支持国内源安装,用户只需一条命令即可完成环境搭建。 13 | - 本地化日志管理:LOGGER.py模块提供丰富的日志记录,便于在不同环境中调试和跟踪项目运行状态。 14 | 15 | ## 程序功能 16 | 1. 模拟用户登录小红书网站 17 | 2. 根据关键词搜索帖子 18 | 3. 收集帖子中的文章数据 19 | 4. 保存文章信息到本地文件中 20 | 21 | ## 程序模块 22 | ``` 23 | ├── LOGGER.py # 日志模块 24 | ├── login.py # 登录相关模块 25 | ├── main.py # 主程序入口 26 | ├── README.md # 项目文档 27 | ├── requirements.txt # 依赖库 28 | ``` 29 | 30 | ## 环境安装 31 | 32 | Step1: 安装谷歌浏览器并下载ChromeDriver 33 | 34 | ``` 35 | https://googlechromelabs.github.io/chrome-for-testing/ 36 | ``` 37 | 38 | Step2: 安装依赖库 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | Tips 推荐使用Python3.10版本 44 | 45 | ## 使用方法 46 | 47 | 1. 首先需要运行`login.py`文件进行扫码登录 48 | - 请注意,登录后需要手动扫码确认登录,登录成功后程序会自动退出。 49 | - 登录成功后,程序会成功用于保存登录状态。 50 | 51 | ```bash 52 | python login.py 53 | ``` 54 | 55 | 56 | 2. 然后运行`main.py`文件进行关键词帖子文章检索和收集 57 | 58 | ```bash 59 | python main.py 60 | ``` 61 | 62 | # 提示:软件运行过程中实时向目标Excel中写入数据,请不要打开Excel防止无法写入 63 | -------------------------------------------------------------------------------- /LOGGER.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import logging 5 | from rich.console import Console 6 | from rich.logging import RichHandler 7 | from rich.theme import Theme 8 | 9 | custom_theme = Theme({ 10 | "log.time": "dim blue", 11 | "logging.level.debug": "cyan", 12 | "logging.level.info": "green", 13 | "logging.level.warning": "yellow", 14 | "logging.level.error": "bold red", 15 | "logging.level.critical": "reverse bold red", 16 | }) 17 | 18 | def configure_logger(logger_name: str = "default", debug: bool = False, log_file: str = None): 19 | console = Console(theme=custom_theme) 20 | rich_handler = RichHandler(console=console, show_time=True, show_path=False) 21 | formatter = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s") 22 | rich_handler.setFormatter(formatter) 23 | handlers = [rich_handler] 24 | if log_file: 25 | file_handler = logging.FileHandler(log_file, mode='a', encoding='utf-8') 26 | file_handler.setFormatter(formatter) 27 | handlers.append(file_handler) 28 | 29 | if debug: 30 | logger_level = logging.DEBUG 31 | else: 32 | logger_level = logging.INFO 33 | 34 | logging.basicConfig( 35 | level=logger_level, 36 | handlers=handlers 37 | ) 38 | 39 | return logging.getLogger(logger_name) 40 | 41 | # Non strict singleton mode returns this logger 42 | class GetLogger: 43 | _instance = {} 44 | 45 | def __new__(cls, logger_name="rich", debug=True, log_file=None): 46 | if logger_name not in cls._instance: 47 | cls._instance[logger_name] = configure_logger(logger_name, debug, log_file) 48 | return cls._instance[logger_name] 49 | 50 | def __init__(self): 51 | pass 52 | -------------------------------------------------------------------------------- /login.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import stat 6 | import time 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.chrome.service import Service 13 | from selenium.common.exceptions import TimeoutException 14 | 15 | from LOGGER import GetLogger 16 | 17 | logger = GetLogger(logger_name="XiaoHongShu", debug=False, log_file="XiaoHongShu.log") 18 | 19 | USERPATH = os.path.abspath("./userData") 20 | 21 | if not os.path.exists(USERPATH): 22 | os.mkdir(USERPATH) 23 | # 赋予所有用户完全访问权限 24 | os.chmod(USERPATH, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) # 设置权限为 77 25 | 26 | chrome_options = Options() 27 | 28 | # 使用用户数据目录 29 | chrome_options.add_argument(f"--user-data-dir={USERPATH}") 30 | chrome_options.add_argument("--window-size=1280,720") 31 | service = Service('./chromedriver.exe') 32 | 33 | driver = webdriver.Chrome(options=chrome_options, service=service) 34 | 35 | # 定义常量 36 | LOGIN_CONTAINER_CLASS = 'login-container' 37 | LOGIN_BTN_CLASS = 'login-btn' 38 | USER_BTN_CLASS = 'user' 39 | XIAOHONGSHU_URL = 'https://www.xiaohongshu.com/explore' 40 | 41 | # 登录函数 42 | def login(): 43 | driver.get(XIAOHONGSHU_URL) 44 | try: 45 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 46 | except TimeoutException: 47 | try: 48 | login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, LOGIN_BTN_CLASS))) 49 | login_button.click() 50 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 51 | except: 52 | logger.error("登录失败") 53 | return False 54 | logger.info("请使用手机扫码登录,请不要关闭登录页面否则会导致程序报错") 55 | 56 | while True: 57 | try: 58 | WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 59 | if driver.find_elements(By.CLASS_NAME, LOGIN_BTN_CLASS): 60 | logger.warning("不要关闭登录页面,请正常扫码登录!") 61 | driver.find_element(By.CLASS_NAME, LOGIN_BTN_CLASS).click() 62 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, LOGIN_CONTAINER_CLASS))) 63 | elif driver.find_elements(By.CLASS_NAME, USER_BTN_CLASS): 64 | logger.info("登录成功!") 65 | return True 66 | else: 67 | logger.error("登录失败,发生了未知的错误!") 68 | return False 69 | except Exception as e: 70 | logger.info("页面已经刷新...5秒后重启登录") 71 | logger.info("如果您发现页面的二维码已经刷新请手动点击刷新") 72 | time.sleep(5) 73 | 74 | if __name__ == '__main__': 75 | if login(): 76 | logger.info("登录成功,这下可以直接运行App.py了") 77 | else: 78 | logger.error("登录失败,请检查您的网络连接和登录信息") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | 6 | from openpyxl import Workbook 7 | import openpyxl 8 | from openpyxl.utils import get_column_letter 9 | from openpyxl.styles import Font, PatternFill, Alignment 10 | 11 | import time 12 | from lxml import etree 13 | 14 | from selenium import webdriver 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.ui import WebDriverWait 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from selenium.webdriver.common.keys import Keys 19 | from selenium.webdriver.chrome.options import Options 20 | from selenium.webdriver.common.action_chains import ActionChains 21 | from selenium.webdriver.chrome.service import Service 22 | from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,ElementClickInterceptedException 23 | 24 | from LOGGER import GetLogger 25 | 26 | logger = GetLogger(logger_name="XiaoHongShu", debug=False, log_file="XiaoHongShu.log") 27 | 28 | # 设置 Chrome 选项以避免打印出过多日志 29 | chrome_options = Options() 30 | USERPATH = os.path.abspath("./userData") 31 | service = Service('./chromedriver.exe') 32 | chrome_options.add_argument(f"--user-data-dir={USERPATH}") 33 | chrome_options.add_argument("--headless=new") # 无头模式 34 | chrome_options.add_argument("--disable-gpu") 35 | chrome_options.add_argument("--window-size=1920,1080") 36 | # chrome_options.add_argument("--window-size=1280,720") 37 | 38 | 39 | chrome_options.add_argument("--disable-logging") 40 | chrome_options.add_argument("--log-level=3") 41 | # 浏览器伪装:模拟真实的用户在操作.防反爬 42 | chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 43 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 44 | chrome_options.add_experimental_option("useAutomationExtension", False) 45 | # 初始化 WebDriver 46 | driver = webdriver.Chrome(options=chrome_options,service=service) 47 | 48 | LOGIN_CONTAINER_CLASS = 'login-container' 49 | LOGIN_BTN_CLASS = 'login-btn' 50 | USER_BTN_CLASS = 'user' 51 | 52 | XIAOHONGSHU_URL = 'https://www.xiaohongshu.com/explore' 53 | 54 | def search_page(keyword,total): 55 | logger.info("打开小红书页面中") 56 | driver.get(XIAOHONGSHU_URL) 57 | SEARCH_INPUT_ID = 'search-input' 58 | SEARCH_ICON_CLASS = 'search-icon' 59 | # 向搜索框发送关键字 60 | logger.info(f"搜索关键字[{keyword}]中") 61 | search_input_object = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, SEARCH_INPUT_ID))) 62 | search_input_object.send_keys(keyword) 63 | logger.info(f"输入关键字[{keyword}]完成") 64 | # 点击搜索按钮 65 | logger.info(f"点击搜索按钮中") 66 | search_input_button_object = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, SEARCH_ICON_CLASS))) 67 | search_input_button_object.click() 68 | logger.info(f"点击搜索按钮完成") 69 | time.sleep(2) 70 | # 点击筛选按钮 71 | FILTER_INCO_CLASS = 'filter' 72 | filter_area_object = WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.CLASS_NAME, FILTER_INCO_CLASS))) 73 | # 悬浮至该按钮上 74 | ActionChains(driver).move_to_element(filter_area_object).perform() 75 | time.sleep(1) # 等待下拉菜单显示 76 | # 获取所有具有类名 'dropdown-items' 的元素 77 | dropdown_menus = driver.find_elements(By.CLASS_NAME, "dropdown-items") 78 | 79 | # 过滤出 display 样式为 inline-block 的元素(就是正常显示的那个) 80 | visible_menus = [ 81 | menu for menu in dropdown_menus 82 | if driver.execute_script("return window.getComputedStyle(arguments[0]).display;", menu) == 'inline-block' 83 | ] 84 | # 输出符合条件的元素数量及内容 85 | logger.info(f"找到 {len(visible_menus)} 个 display 为 inline-block 的 'dropdown-items' 元素") 86 | 87 | if len(visible_menus) == 0: 88 | logger.error("未找到任何可见的菜单") 89 | return 90 | 91 | # 获取第一个可见的菜单 92 | dropdown_menu = visible_menus[0] 93 | # 获取第一个可见的菜单中的所有