├── requirements.txt ├── LICENSE ├── README.md ├── config.py ├── .gitignore ├── main.py ├── redis_client.py ├── pixiv_download.py ├── pixiv_spider.py └── redis_monitor.py /requirements.txt: -------------------------------------------------------------------------------- 1 | redis==5.2.1 2 | requests==2.32.3 3 | rich==13.7.1 4 | urllib3<2.0.0 # 确保与requests兼容 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 岛风 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **一个Pixiv小爬虫,目前只可以爬每日, 支持长时间爬取 跳过已经爬过的** 2 | 3 | ![HapiGo_2024-12-20_12.39.49.png](https://img.nyaasuki.com/2024/12/20/6764f51f5fccf.png) 4 | 5 | ## 环境需求 6 | 7 | Python:3.8+ / Redis 8 | 9 | ## 食用方法 10 | 11 | **Linux/OSX:** 12 | 13 | ```shell 14 | git clone https://github.com/nyaasuki/PixivSpider.git && cd ./PixivSpider 15 | python3 main.py 16 | ``` 17 | 18 | **Windows:** 19 | 20 | 1. 下载/clone这个项目 21 | 22 | 2. 配置好环境(python、Redis) 23 | 24 | 3. 打开你的CMD窗口 25 | 26 | 4. 输入python+‘ ’ ←这是一个空格 27 | 28 | 5. 用鼠标把**main.py**这个文件拖到cmd窗口 29 | 30 | ​ ^_^ 31 | 32 | ## 注意事项 33 | 34 | 1.requests安装错误 35 | 36 | `ERROR: Could not find a version that satisfies the requirement resquests 37 | ERROR: No matching distribution found for resquests` 38 | 39 | 解决方案:手动安装requests 40 | 41 | 'pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests' 42 | 43 | 2.请输入一个cookie 44 | 45 | 目前此项留空直接回车也可以正常爬取(匿名模式),如果后续添加新功能可能需要 46 | 47 | 此项储存在本地redis中 48 | 49 | 3.错误:无法连接到Redis服务,请确保Redis服务正在运行 50 | 项目使用redis查重 需要安装redis 51 | 官方安装教程:https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/ 52 | 53 | ## 特别提醒 54 | 55 | 正常来说,当没有出现上方问题时,程序出现问题大多为你的上网方式不够科学 56 | 缓慢更新中... 57 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | """配置管理""" 2 | from typing import Dict, Any 3 | from dataclasses import dataclass 4 | 5 | @dataclass 6 | class RedisConfig: 7 | """Redis配置""" 8 | host: str = 'localhost' 9 | port: int = 6379 10 | max_connections: int = 10 11 | db_range: tuple = (0, 5) # 支持的数据库范围(包含) 12 | 13 | @dataclass 14 | class PixivConfig: 15 | """Pixiv API配置""" 16 | ajax_url: str = 'https://www.pixiv.net/ajax/illust/{}/pages' 17 | top_url: str = 'https://www.pixiv.net/ranking.php' 18 | user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36' 19 | headers: Dict[str, str] = None 20 | 21 | def __post_init__(self): 22 | """初始化默认请求头""" 23 | self.headers = { 24 | 'accept': 'application/json', 25 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6', 26 | 'dnt': '1', 27 | 'referer': 'https://www.pixiv.net/', 28 | 'sec-fetch-mode': 'cors', 29 | 'sec-fetch-site': 'same-origin', 30 | 'user-agent': self.user_agent 31 | } 32 | 33 | # 全局配置实例 34 | REDIS_CONFIG = RedisConfig() 35 | PIXIV_CONFIG = PixivConfig() 36 | 37 | # Redis键模式 38 | class RedisKeys: 39 | """Redis键定义""" 40 | COOKIE = 'cookie' 41 | DOWNLOADED_IMAGE = 'downloaded:{pid}_p{page}' # 已下载的图片页 42 | DOWNLOADED_WORK = 'downloaded:{pid}' # 已完成的作品 43 | TOTAL_PAGES = 'total_pages:{pid}' # 作品总页数 44 | USER_ID = '{illust_id}' # 作品作者ID 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | 129 | 130 | pixiv.json 131 | img/ 132 | .idea/inspectionProfiles/profiles_settings.xml 133 | .idea/misc.xml 134 | .idea/modules.xml 135 | .idea/PixivSpider.iml 136 | .idea/vcs.xml 137 | .idea/.gitignore 138 | test.py 139 | .idea/dictionaries/i.xml 140 | .DS_Store 141 | .idea/.name 142 | .idea/workspace.xml 143 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Pixiv爬虫 - 主程序入口 4 | 环境需求:Python3.8+ / Redis 5 | """ 6 | import sys 7 | from typing import NoReturn 8 | import requests.packages.urllib3 9 | from rich.console import Console 10 | import redis.exceptions 11 | 12 | from pixiv_spider import PixivSpider 13 | import redis_monitor 14 | from config import REDIS_CONFIG 15 | 16 | # 禁用SSL警告 17 | requests.packages.urllib3.disable_warnings() 18 | 19 | console = Console() 20 | 21 | def show_main_menu() -> NoReturn: 22 | """显示主菜单并处理用户选择""" 23 | while True: 24 | try: 25 | console.print("\n=== PixivSpider ===") 26 | console.print("1. 爬取每日排行榜") 27 | console.print("2. Redis数据库操作") 28 | console.print("3. 退出程序") 29 | 30 | choice = console.input("\n请选择操作 (1-3): ") 31 | 32 | if choice == "1": 33 | run_spider() 34 | elif choice == "2": 35 | run_redis_monitor() 36 | elif choice == "3": 37 | console.print("\n[green]再见![/green]") 38 | sys.exit(0) 39 | else: 40 | console.print("\n[red]无效的选择,请重试[/red]") 41 | 42 | except KeyboardInterrupt: 43 | console.print("\n\n[yellow]检测到Ctrl+C,正在安全退出...[/yellow]") 44 | sys.exit(0) 45 | except Exception as e: 46 | console.print(f"\n[red]发生错误:{str(e)}[/red]") 47 | 48 | def run_spider() -> None: 49 | """运行Pixiv爬虫""" 50 | console.print("\n=== 启动PixivSpider ===") 51 | console.print("[yellow]确保已安装并启动Redis服务[/yellow]") 52 | console.print("[yellow]确保已准备好有效的Pixiv Cookie[/yellow]") 53 | 54 | while True: 55 | try: 56 | console.print("\n[cyan]可用的Redis数据库:[/cyan]") 57 | min_db, max_db = REDIS_CONFIG.db_range 58 | for i in range(min_db, max_db + 1): 59 | console.print(f"{i}.DB{i}") 60 | 61 | db_choice = console.input("\n请选择Redis数据库: ") 62 | db_num = int(db_choice) 63 | 64 | if min_db <= db_num <= max_db: 65 | spider = PixivSpider(db_num) 66 | spider.run() 67 | break 68 | else: 69 | console.print(f"[red]错误:请输入{min_db}到{max_db}之间的数字[/red]") 70 | 71 | except redis.exceptions.ConnectionError: 72 | console.print('[red]错误:无法连接到Redis服务,请确保Redis服务正在运行[/red]') 73 | break 74 | except ValueError: 75 | console.print("[red]错误:请输入有效的数字[/red]") 76 | except KeyboardInterrupt: 77 | console.print('\n[yellow]用户中断运行[/yellow]') 78 | break 79 | except Exception as e: 80 | console.print(f'[red]发生错误:{str(e)}[/red]') 81 | break 82 | 83 | def run_redis_monitor() -> None: 84 | """运行Redis管理工具""" 85 | console.print("\n=== 启动Redis管理工具 ===") 86 | redis_monitor.show_menu() 87 | 88 | def check_dependencies() -> None: 89 | """检查并安装依赖包""" 90 | try: 91 | import redis 92 | import requests 93 | from rich import console, progress, layout, panel 94 | except ImportError: 95 | console.print('[yellow]检测到缺少必要包!正在尝试安装!.....[/yellow]') 96 | import os 97 | os.system('pip install -r requirements.txt') 98 | 99 | # 重新导入以验证安装 100 | import redis 101 | import requests 102 | from rich import console, progress, layout, panel 103 | 104 | console.print('[green]依赖安装完成[/green]') 105 | 106 | if __name__ == "__main__": 107 | try: 108 | check_dependencies() 109 | show_main_menu() 110 | except Exception as e: 111 | console.print(f"[red]程序启动失败:{str(e)}[/red]") 112 | sys.exit(1) 113 | -------------------------------------------------------------------------------- /redis_client.py: -------------------------------------------------------------------------------- 1 | """Redis客户端管理""" 2 | from typing import Optional 3 | import redis 4 | from redis.connection import ConnectionPool 5 | from config import REDIS_CONFIG, RedisKeys 6 | 7 | class RedisClient: 8 | """Redis客户端管理器,使用连接池""" 9 | _pools: dict[int, ConnectionPool] = {} 10 | _instance: Optional['RedisClient'] = None 11 | 12 | def __new__(cls) -> 'RedisClient': 13 | """确保单例""" 14 | if cls._instance is None: 15 | cls._instance = super().__new__(cls) 16 | return cls._instance 17 | 18 | def __init__(self): 19 | """初始化客户端管理器""" 20 | if not hasattr(self, '_initialized'): 21 | self._initialized = True 22 | self._current_db = 0 23 | self._redis: Optional[redis.Redis] = None 24 | self._init_connection() 25 | 26 | def _get_pool(self, db: int) -> ConnectionPool: 27 | """获取指定数据库的连接池""" 28 | if db not in self._pools: 29 | self._pools[db] = redis.ConnectionPool( 30 | host=REDIS_CONFIG.host, 31 | port=REDIS_CONFIG.port, 32 | db=db, 33 | max_connections=REDIS_CONFIG.max_connections, 34 | decode_responses=True 35 | ) 36 | return self._pools[db] 37 | 38 | def _init_connection(self) -> None: 39 | """初始化当前数据库的连接""" 40 | self._redis = redis.Redis( 41 | connection_pool=self._get_pool(self._current_db) 42 | ) 43 | 44 | def select_db(self, db: int) -> bool: 45 | """ 46 | 切换到指定数据库 47 | 48 | 参数: 49 | db: 数据库编号 50 | 51 | 返回: 52 | bool: 成功返回True,失败返回False 53 | """ 54 | min_db, max_db = REDIS_CONFIG.db_range 55 | if not min_db <= db <= max_db: 56 | return False 57 | 58 | if db != self._current_db: 59 | self._current_db = db 60 | self._init_connection() 61 | return True 62 | 63 | @property 64 | def client(self) -> redis.Redis: 65 | """获取当前Redis客户端""" 66 | return self._redis 67 | 68 | def get_cookie(self) -> Optional[str]: 69 | """获取存储的Pixiv cookie""" 70 | return self._redis.get(RedisKeys.COOKIE) 71 | 72 | def set_cookie(self, cookie: str) -> None: 73 | """存储Pixiv cookie""" 74 | self._redis.set(RedisKeys.COOKIE, cookie) 75 | 76 | def is_image_downloaded(self, pid: str, page: int) -> bool: 77 | """检查特定图片页是否已下载""" 78 | key = RedisKeys.DOWNLOADED_IMAGE.format(pid=pid, page=page) 79 | return self._redis.get(key) == 'true' 80 | 81 | def mark_image_downloaded(self, pid: str, page: int) -> None: 82 | """标记特定图片页为已下载""" 83 | key = RedisKeys.DOWNLOADED_IMAGE.format(pid=pid, page=page) 84 | self._redis.set(key, 'true') 85 | 86 | def is_work_complete(self, pid: str) -> bool: 87 | """检查作品是否已完全下载""" 88 | key = RedisKeys.DOWNLOADED_WORK.format(pid=pid) 89 | return self._redis.get(key) == 'complete' 90 | 91 | def mark_work_complete(self, pid: str) -> None: 92 | """标记作品为已完全下载""" 93 | key = RedisKeys.DOWNLOADED_WORK.format(pid=pid) 94 | self._redis.set(key, 'complete') 95 | 96 | def get_total_pages(self, pid: str) -> Optional[int]: 97 | """获取作品总页数""" 98 | key = RedisKeys.TOTAL_PAGES.format(pid=pid) 99 | value = self._redis.get(key) 100 | return int(value) if value else None 101 | 102 | def set_total_pages(self, pid: str, total: int) -> None: 103 | """设置作品总页数""" 104 | key = RedisKeys.TOTAL_PAGES.format(pid=pid) 105 | self._redis.set(key, str(total)) 106 | 107 | def store_user_id(self, illust_id: str, user_id: str) -> None: 108 | """存储作品作者ID""" 109 | key = RedisKeys.USER_ID.format(illust_id=illust_id) 110 | self._redis.set(key, user_id) 111 | 112 | def get_db_stats(self) -> tuple[int, list[str]]: 113 | """ 114 | 获取当前数据库统计信息 115 | 116 | 返回: 117 | tuple: (作品数量, 作品ID列表) 118 | """ 119 | pattern = RedisKeys.DOWNLOADED_IMAGE.format(pid='*', page='0') 120 | work_keys = self._redis.keys(pattern) 121 | work_ids = [key.split(':')[1].split('_')[0] for key in work_keys] 122 | return len(work_ids), work_ids 123 | 124 | def clear_db(self) -> None: 125 | """清空当前数据库""" 126 | self._redis.flushdb() 127 | 128 | def close(self) -> None: 129 | """关闭所有连接池""" 130 | for pool in self._pools.values(): 131 | pool.disconnect() 132 | self._pools.clear() 133 | -------------------------------------------------------------------------------- /pixiv_download.py: -------------------------------------------------------------------------------- 1 | """Pixiv下载组件""" 2 | import os 3 | import re 4 | from typing import Optional, Union 5 | import requests 6 | from rich.progress import Progress 7 | 8 | from config import PIXIV_CONFIG 9 | from redis_client import RedisClient 10 | 11 | class PixivDownloader: 12 | """处理Pixiv图片下载""" 13 | 14 | def __init__(self, headers: dict, progress: Progress): 15 | """ 16 | 初始化下载器 17 | 18 | 参数: 19 | headers: 带cookie的请求头 20 | progress: Rich进度条实例 21 | """ 22 | self.headers = headers 23 | self.progress = progress 24 | self.redis = RedisClient() 25 | 26 | def download_image(self, url: str) -> bool: 27 | """ 28 | 下载单张图片 29 | 30 | 参数: 31 | url: 图片URL 32 | 33 | 返回: 34 | bool: 成功返回True,失败返回False 35 | """ 36 | # 从URL提取图片信息 37 | match = re.search(r'/(\d+)_p(\d+)\.([a-z]+)$', url) 38 | if not match: 39 | return False 40 | 41 | illust_id, page_num, extension = match.groups() 42 | file_name = f"{illust_id}_p{page_num}.{extension}" 43 | 44 | # 检查是否已下载 45 | if self.redis.is_image_downloaded(illust_id, page_num): 46 | return True 47 | 48 | # 确保下载目录存在 49 | if not os.path.isdir('./img'): 50 | os.makedirs('./img') 51 | 52 | # 下载重试机制 53 | for attempt in range(3): 54 | try: 55 | response = requests.get( 56 | url, 57 | headers=self.headers, 58 | timeout=15, 59 | verify=False 60 | ) 61 | if response.status_code == 200: 62 | # 保存图片 63 | with open(f'./img/{file_name}', 'wb') as fp: 64 | fp.write(response.content) 65 | 66 | # 更新Redis记录 67 | self.redis.mark_image_downloaded(illust_id, page_num) 68 | 69 | # 更新总页数 70 | total_pages = self.redis.get_total_pages(illust_id) 71 | if not total_pages: 72 | self.redis.set_total_pages(illust_id, int(page_num) + 1) 73 | elif int(page_num) + 1 == total_pages: 74 | # 检查作品是否完成 75 | all_downloaded = all( 76 | self.redis.is_image_downloaded(illust_id, i) 77 | for i in range(total_pages) 78 | ) 79 | if all_downloaded: 80 | self.redis.mark_work_complete(illust_id) 81 | 82 | return True 83 | 84 | except requests.RequestException: 85 | if attempt == 2: # 最后一次尝试失败 86 | return False 87 | continue 88 | 89 | return False 90 | 91 | def download_work(self, work_id: str) -> bool: 92 | """ 93 | 下载作品的所有图片 94 | 95 | 参数: 96 | work_id: Pixiv作品ID 97 | 98 | 返回: 99 | bool: 全部成功返回True,否则False 100 | """ 101 | # 跳过已完成的作品 102 | if self.redis.is_work_complete(work_id): 103 | return True 104 | 105 | try: 106 | # 获取图片URL列表 107 | response = requests.get( 108 | PIXIV_CONFIG.ajax_url.format(work_id), 109 | headers=self.headers, 110 | verify=False 111 | ) 112 | data = response.json() 113 | 114 | if data.get('error'): 115 | return False 116 | 117 | images = data.get('body', []) 118 | if not images: 119 | return False 120 | 121 | # 下载每张图片 122 | if len(images) > 1: 123 | # 多图作品 124 | subtask_id = self.progress.add_task( 125 | f"[yellow]PID:{work_id}", 126 | total=len(images) 127 | ) 128 | 129 | success = True 130 | for image in images: 131 | if 'urls' not in image or 'original' not in image['urls']: 132 | success = False 133 | continue 134 | 135 | if not self.download_image(image['urls']['original']): 136 | success = False 137 | 138 | self.progress.update(subtask_id, advance=1) 139 | 140 | self.progress.remove_task(subtask_id) 141 | return success 142 | 143 | else: 144 | # 单图作品 145 | if 'urls' not in images[0] or 'original' not in images[0]['urls']: 146 | return False 147 | return self.download_image(images[0]['urls']['original']) 148 | 149 | except (requests.RequestException, KeyError, ValueError): 150 | return False 151 | -------------------------------------------------------------------------------- /pixiv_spider.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pixiv爬虫 - 每日排行榜下载 3 | 环境需求:Python3.8+ / Redis 4 | """ 5 | from typing import Generator, List, Dict, Any 6 | import requests 7 | from rich.console import Console 8 | from rich.progress import ( 9 | Progress, 10 | BarColumn, 11 | TaskProgressColumn, 12 | TextColumn, 13 | SpinnerColumn 14 | ) 15 | from rich.live import Live 16 | from rich.layout import Layout 17 | from rich.panel import Panel 18 | from rich.console import Group 19 | 20 | from config import PIXIV_CONFIG 21 | from redis_client import RedisClient 22 | from pixiv_download import PixivDownloader 23 | 24 | requests.packages.urllib3.disable_warnings() 25 | 26 | class PixivSpider: 27 | """Pixiv每日排行榜爬虫""" 28 | 29 | TOTAL_IMAGES = 500 # 每日排行榜总图片数 30 | 31 | def __init__(self, db: int = 0): 32 | """ 33 | 初始化爬虫 34 | 35 | 参数: 36 | db: Redis数据库编号(0-5) 37 | """ 38 | # 设置Redis 39 | self.redis = RedisClient() 40 | if not self.redis.select_db(db): 41 | raise ValueError(f"无效的Redis数据库编号: {db}") 42 | 43 | # 设置界面组件 44 | self.console = Console() 45 | self._setup_ui() 46 | 47 | # 初始化状态 48 | self.headers = None 49 | self.current_ranking_data = [] 50 | self.failed_works = [] 51 | 52 | def _setup_ui(self) -> None: 53 | """设置Rich界面组件""" 54 | # 创建布局 55 | self.layout = Layout() 56 | self.layout.split( 57 | Layout(name="PixivSpider", ratio=8), 58 | Layout(name="progress", ratio=2) 59 | ) 60 | 61 | # 创建进度条 62 | self.progress = Progress( 63 | TextColumn("[bold blue]{task.description}"), 64 | BarColumn(bar_width=40), 65 | TaskProgressColumn(), 66 | TextColumn("{task.fields[speed]}"), 67 | console=Console(stderr=True), 68 | expand=True 69 | ) 70 | 71 | # 设置日志面板 72 | self.log_messages = [] 73 | self.main_task_id = self.progress.add_task( 74 | "[cyan]总体进度", 75 | total=self.TOTAL_IMAGES, 76 | speed="" 77 | ) 78 | 79 | def _update_log(self, message: str) -> None: 80 | """更新日志显示""" 81 | self.log_messages.append(message) 82 | if len(self.log_messages) > 18: 83 | self.log_messages.pop(0) 84 | log_group = Group(*self.log_messages) 85 | self.layout["PixivSpider"].update( 86 | Panel( 87 | log_group, 88 | title="PixivSpider", 89 | title_align="left", 90 | border_style="cyan", 91 | padding=(0, 1) 92 | ) 93 | ) 94 | 95 | def _setup_session(self) -> None: 96 | """设置请求会话""" 97 | cookie = self.redis.get_cookie() 98 | if not cookie: 99 | cookie = input('请输入一个cookie:') 100 | self.redis.set_cookie(cookie) 101 | 102 | self.headers = PIXIV_CONFIG.headers.copy() 103 | self.headers['cookie'] = cookie 104 | 105 | def get_ranking_page(self, page: int) -> None: 106 | """ 107 | 获取排行榜单页数据 108 | 109 | 参数: 110 | page: 页码(1-10) 111 | """ 112 | params = { 113 | 'mode': 'daily', 114 | 'content': 'illust', 115 | 'p': str(page), 116 | 'format': 'json' 117 | } 118 | 119 | response = requests.get( 120 | PIXIV_CONFIG.top_url, 121 | params=params, 122 | headers=self.headers, 123 | verify=False 124 | ) 125 | data = response.json() 126 | self.current_ranking_data = data['contents'] 127 | 128 | def process_ranking_data(self) -> Generator[str, None, None]: 129 | """ 130 | 处理当前排行榜数据 131 | 132 | 生成: 133 | str: 作品ID 134 | """ 135 | for item in self.current_ranking_data: 136 | work_id = str(item['illust_id']) 137 | user_id = str(item['user_id']) 138 | self.redis.store_user_id(work_id, user_id) 139 | yield work_id 140 | 141 | def run(self) -> None: 142 | """运行爬虫""" 143 | self._setup_session() 144 | downloader = PixivDownloader(self.headers, self.progress) 145 | 146 | with Live(self.layout, self.console, refresh_per_second=10): 147 | self.layout["progress"].update(self.progress) 148 | self._update_log('[cyan]开始抓取...[/cyan]') 149 | 150 | # 处理排行榜页面 151 | for page in range(1, 11): 152 | try: 153 | self.get_ranking_page(page) 154 | for work_id in self.process_ranking_data(): 155 | if not downloader.download_work(work_id): 156 | self.failed_works.append(work_id) 157 | self.progress.update(self.main_task_id, advance=1) 158 | 159 | except requests.RequestException as e: 160 | self._update_log(f'[red]获取排行榜第{page}页时发生错误:{str(e)}[/red]') 161 | continue 162 | 163 | # 清理失败作品的记录 164 | for work_id in self.failed_works: 165 | self.redis.client.delete(work_id) 166 | 167 | self._update_log('[green]爬虫运行完成[/green]') 168 | -------------------------------------------------------------------------------- /redis_monitor.py: -------------------------------------------------------------------------------- 1 | """Redis监控和管理工具""" 2 | from typing import Optional, Dict 3 | import sys 4 | from rich.console import Console 5 | from rich.table import Table 6 | from rich.prompt import Prompt, Confirm 7 | 8 | from redis_client import RedisClient 9 | from config import REDIS_CONFIG 10 | 11 | console = Console() 12 | 13 | class RedisMonitor: 14 | """Redis监控和管理界面""" 15 | 16 | def __init__(self): 17 | """初始化监控器""" 18 | self.redis = RedisClient() 19 | 20 | def _show_db_info(self, db_index: int) -> None: 21 | """ 22 | 显示数据库详细信息 23 | 24 | 参数: 25 | db_index: 数据库编号 26 | """ 27 | try: 28 | self.redis.select_db(db_index) 29 | 30 | table = Table(title=f"数据库 db{db_index} 信息") 31 | table.add_column("项目", style="cyan") 32 | table.add_column("值", style="green") 33 | 34 | # Cookie状态 35 | cookie = self.redis.get_cookie() 36 | table.add_row( 37 | "Cookie状态", 38 | cookie[:30] + "..." if cookie else "未设置" 39 | ) 40 | 41 | # 作品统计 42 | work_count, work_ids = self.redis.get_db_stats() 43 | table.add_row("已下载作品数", str(work_count)) 44 | 45 | console.print(table) 46 | 47 | except Exception as e: 48 | console.print(f"[red]获取数据库信息时出错:{str(e)}[/red]") 49 | 50 | def show_status(self) -> None: 51 | """显示Redis状态和数据库信息""" 52 | try: 53 | # 获取活跃数据库 54 | active_dbs = [] 55 | min_db, max_db = REDIS_CONFIG.db_range 56 | for db in range(min_db, max_db + 1): 57 | if self.redis.select_db(db): 58 | work_count, _ = self.redis.get_db_stats() 59 | if work_count > 0: 60 | active_dbs.append(db) 61 | 62 | if not active_dbs: 63 | console.print("\n[yellow]当前没有活跃的数据库[/yellow]") 64 | return 65 | 66 | # 显示数据库列表 67 | db_list = ", ".join(f"db{db}" for db in active_dbs) 68 | console.print(f"\n[cyan]活跃的数据库: {db_list}[/cyan]") 69 | 70 | # 显示详细信息 71 | if len(active_dbs) == 1: 72 | self._show_db_info(active_dbs[0]) 73 | else: 74 | while True: 75 | db = Prompt.ask( 76 | "请选择要查看的数据库编号", 77 | choices=[str(db) for db in active_dbs] 78 | ) 79 | self._show_db_info(int(db)) 80 | break 81 | 82 | except Exception as e: 83 | console.print(f"[red]获取Redis状态时出错:{str(e)}[/red]") 84 | 85 | def clear_database(self) -> None: 86 | """清空Redis数据库""" 87 | try: 88 | # 获取活跃数据库 89 | active_dbs = [] 90 | min_db, max_db = REDIS_CONFIG.db_range 91 | for db in range(min_db, max_db + 1): 92 | if self.redis.select_db(db): 93 | work_count, _ = self.redis.get_db_stats() 94 | if work_count > 0: 95 | active_dbs.append(db) 96 | 97 | if not active_dbs: 98 | console.print("\n[yellow]当前没有活跃的数据库[/yellow]") 99 | return 100 | 101 | # 显示数据库列表 102 | db_list = ", ".join(f"db{db}" for db in active_dbs) 103 | console.print(f"\n[cyan]活跃的数据库: {db_list}[/cyan]") 104 | 105 | # 显示选项 106 | console.print("\n清空选项:") 107 | console.print("1. 清空指定数据库") 108 | console.print("2. 清空所有数据库") 109 | console.print("3. 取消操作") 110 | 111 | choice = Prompt.ask("请选择操作", choices=["1", "2", "3"]) 112 | 113 | if choice == "1": 114 | if len(active_dbs) == 1: 115 | db = active_dbs[0] 116 | if Confirm.ask(f"确定要清空数据库 db{db} 吗?"): 117 | self.redis.select_db(db) 118 | self.redis.clear_db() 119 | console.print(f"[green]数据库 db{db} 已清空[/green]") 120 | else: 121 | db = int(Prompt.ask( 122 | "请选择要清空的数据库编号", 123 | choices=[str(db) for db in active_dbs] 124 | )) 125 | if Confirm.ask(f"确定要清空数据库 db{db} 吗?"): 126 | self.redis.select_db(db) 127 | self.redis.clear_db() 128 | console.print(f"[green]数据库 db{db} 已清空[/green]") 129 | 130 | elif choice == "2": 131 | if Confirm.ask("确定要清空所有数据库吗?"): 132 | for db in range(min_db, max_db + 1): 133 | self.redis.select_db(db) 134 | self.redis.clear_db() 135 | console.print("[green]所有数据库已清空[/green]") 136 | 137 | except Exception as e: 138 | console.print(f"[red]清空数据库时出错:{str(e)}[/red]") 139 | 140 | def run(self) -> None: 141 | """运行监控界面""" 142 | while True: 143 | console.print("\n=== Redis管理工具 ===") 144 | console.print("1. 显示状态") 145 | console.print("2. 清空数据库") 146 | console.print("3. 退出") 147 | 148 | try: 149 | choice = Prompt.ask("请选择操作", choices=["1", "2", "3"]) 150 | 151 | if choice == "1": 152 | self.show_status() 153 | elif choice == "2": 154 | self.clear_database() 155 | else: 156 | break 157 | 158 | except KeyboardInterrupt: 159 | console.print("\n[yellow]用户中断操作[/yellow]") 160 | break 161 | except Exception as e: 162 | console.print(f"[red]发生错误:{str(e)}[/red]") 163 | 164 | def show_menu() -> None: 165 | """Redis监控入口""" 166 | try: 167 | monitor = RedisMonitor() 168 | monitor.run() 169 | except Exception as e: 170 | console.print(f"[red]启动Redis管理工具时出错:{str(e)}[/red]") 171 | 172 | if __name__ == '__main__': 173 | show_menu() 174 | --------------------------------------------------------------------------------