├── README.md ├── gui ├── database.py ├── main.py └── ui2.ui ├── requirements.txt └── spider ├── config ├── .PixivToken.json ├── database.json └── proxies.json ├── database ├── __init__.py ├── database.py └── process_data.py ├── main.py ├── model ├── __init__.py └── setudata_finish.py └── pixivAPI ├── __init__.py ├── _proxies.py ├── pixivFavorites.py └── pixivToken.py /README.md: -------------------------------------------------------------------------------- 1 | ## 色图库用到的所有东西 2 | 3 | 用mongodb储存从pixiv上爬下来的数据 4 | 5 | spider文件夹里是爬虫,从P站的公开收藏夹爬图到数据库里 6 | 7 | select里是筛图的脚本和一个下载脚本,用腾讯ai的鉴黄api给数据库里的数据打标签 8 | 9 | gui里是一个手动筛图的gui.. 10 | 11 | api里是给qqbot调用的api 12 | 13 | 14 | 15 | ### config 16 | 17 | | config | | 18 | | ---------- | -------------- | 19 | | mongodb | 数据库地址 | 20 | | database | 数据库 | 21 | | collection | 数据表 | 22 | | path | 下载的色图路径 | 23 | | APPKEY | 腾讯ai的appkey | 24 | | APPID | 腾讯ai的appid | 25 | | username | pixiv的用户名 | 26 | | password | pixiv的密码 | 27 | 28 | 爬下来的数据: 29 | ![image-20200614113034532](https://cdn.jsdelivr.net/gh/yuban10703/BlogImgdata/img/20200614113034.png) 30 | 31 | 简陋的gui: 32 | 33 | ![image-20200614113234090](https://cdn.jsdelivr.net/gh/yuban10703/BlogImgdata/img/20200614113234.png) 34 | 35 | 有问题可以提iss.. -------------------------------------------------------------------------------- /gui/database.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from bson import ObjectId 3 | 4 | 5 | class DataBase: 6 | myclient = pymongo.MongoClient('mongodb+srv://username:password@cludn.mongodb.net/setu?retryWrites=true&w=majority') # 数据库地址 7 | mydb = myclient['setu'] # 数据库 8 | setu = mydb['setu_v5'] # 集合 9 | setu_del = mydb['setu_del_v5'] # 集合 10 | 11 | @classmethod 12 | def init_gui(cls): 13 | return list(cls.setu.find().sort('_id', 1).limit(1))[0] 14 | 15 | @classmethod 16 | def previous(cls, _id: str): 17 | return list(cls.setu.find({'_id': {'$lt': ObjectId(_id)}}).sort('_id', -1).limit(1))[0] 18 | 19 | @classmethod 20 | def next(cls, _id: str): 21 | return list(cls.setu.find({'_id': {'$gt': ObjectId(_id)}}).sort('_id', 1).limit(1))[0] 22 | 23 | @classmethod 24 | def next_r18(cls, _id: str): 25 | return list(cls.setu.find({'r18': True, '_id': {'$gt': ObjectId(_id)}}).sort('_id', 1).limit(1))[0] 26 | 27 | @classmethod 28 | def next_not_r18(cls, _id: str): 29 | return list(cls.setu.find({'r18': False, '_id': {'$gt': ObjectId(_id)}}).sort('_id', 1).limit(1))[0] 30 | 31 | @classmethod 32 | def getdata(cls, _id: str): 33 | if data := cls.setu.find_one({'_id': ObjectId(_id.replace("\n", ""))}): 34 | return data 35 | else: 36 | return cls.next(_id) 37 | 38 | @classmethod 39 | def find(cls, pid, page=None): 40 | if page: 41 | return cls.setu.find_one({'artwork.id': pid, 'page': page}) 42 | return cls.setu.find_one({'artwork.id': pid}) 43 | 44 | @classmethod 45 | def updateType(cls, _id: str, r18: bool): # 更新type 46 | return cls.setu.update_one({'_id': ObjectId(_id)}, {'$set': {'r18': r18}}).matched_count 47 | 48 | @classmethod 49 | def updateTags(cls, _id: str, tags: list): # 更新type 50 | return cls.setu.update_one({'_id': ObjectId(_id)}, {'$set': {'tags': tags}}).matched_count 51 | 52 | @classmethod 53 | def delsetu(cls, _id, pid, page): 54 | print('删除: {}'.format(cls.setu_del.insert_one({'pid': pid, 'page': page}).inserted_id)) 55 | return cls.setu.delete_one({'_id': ObjectId(_id)}).deleted_count 56 | 57 | @classmethod 58 | def unmodified_count(cls) -> int: 59 | return cls.setu.count_documents({'r18': None}) 60 | 61 | @classmethod 62 | def modified_count(cls) -> int: 63 | return cls.setu.count_documents({'r18': {"$ne": None}}) 64 | 65 | @classmethod 66 | def noFiltrate(cls): 67 | return list(cls.setu.find({'r18': None}).sort('r18', 1).limit(1))[0] 68 | -------------------------------------------------------------------------------- /gui/main.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import time 4 | from functools import lru_cache 5 | from io import BytesIO 6 | from threading import Thread 7 | 8 | import httpx 9 | from PIL import Image 10 | from PySide2 import QtGui 11 | from PySide2.QtUiTools import QUiLoader 12 | from PySide2.QtWidgets import QApplication, QMessageBox 13 | 14 | from database import DataBase 15 | 16 | 17 | # 屎山 18 | 19 | class Stats(): 20 | 21 | def __init__(self): 22 | self.session = httpx.Client() 23 | self.ui = QUiLoader().load('ui2.ui') 24 | 25 | # self.ui.label.setScaledContents(True) # 图片大小自适应 26 | # 按钮对应事件 27 | self.ui.sexy.clicked.connect(lambda: self.change_r18_level(False)) 28 | self.ui.porn.clicked.connect(lambda: self.change_r18_level(True)) 29 | self.ui.addtag.clicked.connect(self.addtag) 30 | self.ui.deltag.clicked.connect(self.deltag) 31 | self.ui.previous.clicked.connect(self.previous) 32 | self.ui.next.clicked.connect(self.next) 33 | self.ui.R18.clicked.connect(self.next_R18) 34 | self.ui.NOTR18.clicked.connect(self.next_not_R18) 35 | self.ui.reset.clicked.connect(self.reset) 36 | self.ui.savetag.clicked.connect(self.savetag) 37 | self.ui.delet.clicked.connect(self.delet) 38 | self.ui.noFiltrate.clicked.connect(self.jump_to_noFiltrate) 39 | # 回车事件 40 | self.ui.idedit.returnPressed.connect(self.jumpID) 41 | self.ui.pidedit.returnPressed.connect(self.jumpPid) 42 | self.ui.pageedit.returnPressed.connect(self.jumpPid) 43 | self.ui.taginput.returnPressed.connect(self.addtag) 44 | 45 | # 数据 46 | self.data = {'_id': '610fd717abf9ac91bca84727'} 47 | # self.data_raw = {'_id': '610fd717abf9ac91bca84727'} # 禁止修改 48 | self.data_raw = DataBase.init_gui() # 禁止修改 49 | self.reset() 50 | # test 51 | # 初始化界面 52 | self.init() 53 | 54 | def showSetuinfo(func): 55 | # @wraps(func) 56 | def wrapper(self, *args, **kwargs): 57 | print('刷新界面') 58 | func(self, *args, **kwargs) 59 | self.ui.picinfo.clear() 60 | print(self.data) 61 | self.data["_id"] = str(self.data["_id"]) 62 | self.data['create_date'] = str(self.data['create_date']) 63 | self.ui.picinfo.append(str(json.dumps(self.data, indent=2, ensure_ascii=False))) 64 | self.ui.taglist.clear() 65 | self.ui.taglist.addItems(self.data['tags']) 66 | self.ui.modified_count.setNum(DataBase.modified_count()) 67 | self.ui.unmodified_count.setNum(DataBase.unmodified_count()) 68 | self.ui.r18.setText('R18:{}'.format(self.data['r18'])) 69 | self.ui.sanity_level.setText('sanity_level:{}'.format(self.data['sanity_level'])) 70 | 71 | return wrapper 72 | 73 | def init(self): 74 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 75 | 76 | def copy(self): 77 | self.data = copy.deepcopy(self.data_raw) 78 | 79 | @showSetuinfo 80 | def reset(self): 81 | self.copy() 82 | # self.showSetuinfo() 83 | 84 | def savetag(self): 85 | print('增加TAG:{}'.format(list(set(self.data['tags']).difference(set(self.data_raw['tags']))))) 86 | print('移除TAG:{}'.format(list(set(self.data_raw['tags']).difference(set(self.data['tags']))))) 87 | DataBase.updateTags(self.data_raw['_id'], self.data['tags']) 88 | 89 | def delet(self): 90 | reply = QMessageBox.question(self.ui, 'Message', '是否删除?', QMessageBox.Yes, QMessageBox.No) 91 | if reply == QMessageBox.Yes: 92 | print('删除数量{}'.format( 93 | DataBase.delsetu(self.data_raw['_id'], self.data_raw['artwork']['id'], self.data_raw['page']))) 94 | elif reply == QMessageBox.No: 95 | pass 96 | 97 | @showSetuinfo 98 | def change_r18_level(self, level: bool): 99 | print('r18:{}'.format(level)) 100 | res = DataBase.updateType(self.data_raw['_id'], level) 101 | print('更新数量:{}'.format(res)) 102 | self.data['r18'] = level 103 | # self.showSetuinfo() 104 | 105 | @showSetuinfo 106 | def addtag(self): 107 | if tag := self.ui.taginput.text(): 108 | self.data['tags'].append(tag) 109 | self.ui.taginput.clear() 110 | # self.showSetuinfo() 111 | 112 | @showSetuinfo 113 | def deltag(self): 114 | if tag := self.ui.taglist.currentItem().text(): # 当前选中的tag 115 | self.data['tags'].remove(tag) # 移除 116 | # self.showSetuinfo() 117 | 118 | @showSetuinfo 119 | def previous(self): 120 | self.data_raw = DataBase.previous(self.data_raw['_id']) 121 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 122 | self.copy() 123 | # self.showSetuinfo() 124 | 125 | @showSetuinfo 126 | def next(self): 127 | self.data_raw = DataBase.next(self.data_raw['_id']) 128 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 129 | self.copy() 130 | # self.showSetuinfo() 131 | 132 | @showSetuinfo 133 | def next_R18(self): 134 | self.data_raw = DataBase.next_r18(self.data_raw['_id']) 135 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 136 | self.copy() 137 | # self.showSetuinfo() 138 | 139 | @showSetuinfo 140 | def next_not_R18(self): 141 | self.data_raw = DataBase.next_not_r18(self.data_raw['_id']) 142 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 143 | self.copy() 144 | # self.showSetuinfo() 145 | 146 | @lru_cache(maxsize=10) # 缓存5张图片? 147 | def downloadPic(self, url): 148 | print('我真的在下载') 149 | res = self.session.get(url, headers={'Referer': 'https://www.pixiv.net'}) 150 | if res.status_code == 200: 151 | return res.content 152 | else: 153 | print(res.status_code) 154 | 155 | def changePic(self, url): 156 | pixmap = QtGui.QPixmap() 157 | start = time.time() 158 | pixmap.loadFromData(self.resize(self.ui.label.width(), self.ui.label.height(), self.downloadPic(url))) 159 | print('下载耗时{}s'.format(time.time() - start)) 160 | self.ui.label.setPixmap(pixmap) 161 | Thread(target=self.downloadPic, args=(DataBase.next(self.data_raw['_id'])['urls']['large'],)).start() 162 | 163 | def resize(self, w_box, h_box, pic_bf): 164 | ''' 165 | resize a pil_image object so it will fit into 166 | a box of size w_box times h_box, but retain aspect ratio 167 | 对一个pil_image对象进行缩放,让它在一个矩形框内,还能保持比例 168 | ''' 169 | with Image.open(BytesIO(pic_bf)) as pic: 170 | w, h = pic.size 171 | print(w, h) 172 | f1 = 1.0 * w_box / w # 1.0 forces float division in Python2 173 | f2 = 1.0 * h_box / h 174 | factor = min([f1, f2]) 175 | # print(f1, f2, factor) # test 176 | # use best down-sizing filter 177 | width = int(w * factor) 178 | height = int(h * factor) 179 | pic.resize((width, height), Image.ANTIALIAS) 180 | # pic.scaled((width, height)) 181 | with BytesIO() as bf: 182 | pic.save(bf, format="PNG") 183 | return bf.getvalue() 184 | 185 | @showSetuinfo 186 | def jumpID(self): 187 | # if self.checktype(self.ui.idedit.text()): 188 | self.data_raw = DataBase.getdata(self.ui.idedit.text()) 189 | self.ui.idedit.clear() 190 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 191 | self.copy() 192 | # self.showSetuinfo() 193 | # else: 194 | # QMessageBox().warning(self.ui, '!', '检查输入') 195 | # return 196 | 197 | def checktype(self, data: str): 198 | try: 199 | int(data) 200 | return True 201 | except: 202 | return False 203 | 204 | @showSetuinfo 205 | def jumpPid(self): 206 | if (len(self.ui.pidedit.text()) == 0) or (self.ui.pidedit.text().isspace()): 207 | QMessageBox().information(self.ui, '?', '请输入pid') 208 | return 209 | if (len(self.ui.pageedit.text()) == 0) or (self.ui.pageedit.text().isspace()): 210 | if self.checktype(self.ui.pidedit.text()): 211 | data = DataBase.find(int(self.ui.pidedit.text())) 212 | else: 213 | QMessageBox().warning(self.ui, '!', '检查输入') 214 | return 215 | else: 216 | if self.checktype(self.ui.pidedit.text()) and self.checktype(self.ui.pageedit.text()): 217 | pid = int(self.ui.pidedit.text()) 218 | page = int(self.ui.pageedit.text()) 219 | data = DataBase.find(pid, page) 220 | else: 221 | QMessageBox().warning(self.ui, '!', '检查输入') 222 | return 223 | self.ui.pidedit.clear() 224 | self.ui.pageedit.clear() 225 | if data is None: 226 | QMessageBox().information(self.ui, '?', '无数据') 227 | return 228 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 229 | self.copy() 230 | # self.showSetuinfo() 231 | 232 | @showSetuinfo 233 | def jump_to_noFiltrate(self): 234 | self.data_raw = DataBase.noFiltrate() 235 | Thread(target=self.changePic, args=(self.data_raw['urls']['large'],)).start() 236 | self.copy() 237 | 238 | def test(self): 239 | print(self.ui.idedit.text()) 240 | print(self.ui.label.height()) 241 | print(self.ui.label.width()) 242 | self.ui.idedit.clear() 243 | 244 | 245 | app = QApplication([]) 246 | stats = Stats() 247 | stats.ui.show() 248 | app.exec_() 249 | -------------------------------------------------------------------------------- /gui/ui2.ui: -------------------------------------------------------------------------------- 1 | 2 | 3 | Form 4 | 5 | 6 | Qt::NonModal 7 | 8 | 9 | 10 | 0 11 | 0 12 | 1365 13 | 1047 14 | 15 | 16 | 17 | 筛setu V1.0 18 | 19 | 20 | 21 | QLayout::SetDefaultConstraint 22 | 23 | 24 | 25 | 26 | 27 | 0 28 | 0 29 | 30 | 31 | 32 | 33 | 506 34 | 850 35 | 36 | 37 | 38 | false 39 | 40 | 41 | 42 | 43 | 44 | false 45 | 46 | 47 | false 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 0 57 | 58 | 59 | 60 | 61 | 6 62 | 63 | 64 | 20 65 | 66 | 67 | 0 68 | 69 | 70 | 0 71 | 72 | 73 | 74 | 75 | 未修改数量 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 已修改数量 90 | 91 | 92 | 93 | 94 | 95 | 96 | false 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 0 110 | 0 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 0 132 | 0 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 28 143 | 144 | 145 | 146 | 147 | QLayout::SetDefaultConstraint 148 | 149 | 150 | 151 | 152 | 153 | 0 154 | 0 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 0 164 | 0 165 | 166 | 167 | 168 | 169 | 40 170 | 25 171 | 172 | 173 | 174 | + 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 0 183 | 0 184 | 185 | 186 | 187 | 188 | 40 189 | 25 190 | 191 | 192 | 193 | - 194 | 195 | 196 | false 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | Qt::Vertical 206 | 207 | 208 | QSizePolicy::Fixed 209 | 210 | 211 | 212 | 0 213 | 11 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 0 225 | 0 226 | 227 | 228 | 229 | TAGS 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 0 238 | 0 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | Qt::Vertical 253 | 254 | 255 | QSizePolicy::Fixed 256 | 257 | 258 | 259 | 20 260 | 13 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 0 269 | 270 | 271 | 20 272 | 273 | 274 | 0 275 | 276 | 277 | 0 278 | 279 | 280 | 281 | 282 | 283 | 284 | _id 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | pid 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | page 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 0 317 | 0 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 0 328 | 329 | 330 | 331 | 332 | 333 | 0 334 | 0 335 | 336 | 337 | 338 | sexy 339 | 340 | 341 | Z 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 0 350 | 0 351 | 352 | 353 | 354 | porn 355 | 356 | 357 | X 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 0 370 | 0 371 | 372 | 373 | 374 | 未筛选 375 | 376 | 377 | S 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 0 386 | 0 387 | 388 | 389 | 390 | 重置 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 0 399 | 0 400 | 401 | 402 | 403 | 保存标签 404 | 405 | 406 | Ctrl+Return 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 0 415 | 0 416 | 417 | 418 | 419 | --> 420 | 421 | 422 | Right 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 0 431 | 0 432 | 433 | 434 | 435 | <-- 436 | 437 | 438 | Left 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 0 447 | 0 448 | 449 | 450 | 451 | 删除 452 | 453 | 454 | Del 455 | 456 | 457 | 458 | 459 | 460 | 461 | R18 462 | 463 | 464 | K 465 | 466 | 467 | 468 | 469 | 470 | 471 | NOT R18 472 | 473 | 474 | L 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asyncio 2 | pymongo 3 | requests 4 | aiohttp 5 | aiofiles 6 | Pillow 7 | FastAPI 8 | py-cpuinfo 9 | psutil 10 | motor 11 | retrying 12 | -------------------------------------------------------------------------------- /spider/config/.PixivToken.json: -------------------------------------------------------------------------------- 1 | { 2 | "refresh_token": "" 3 | } -------------------------------------------------------------------------------- /spider/config/database.json: -------------------------------------------------------------------------------- 1 | { 2 | "mongodb": "mongodb://username:password@10.1.1.168:27017/setu", 3 | "database": "setu", 4 | "collection": "setu_test", 5 | "collection_del": "setu_del_test" 6 | } -------------------------------------------------------------------------------- /spider/config/proxies.json: -------------------------------------------------------------------------------- 1 | { 2 | "proxies_http": {}, 3 | "proxies_socks": "" 4 | } -------------------------------------------------------------------------------- /spider/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import Database 2 | from .process_data import process_data -------------------------------------------------------------------------------- /spider/database/database.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from pathlib import Path 4 | from typing import List 5 | 6 | import pymongo 7 | 8 | from spider.model import Setu 9 | 10 | try: 11 | with open(Path(__file__).absolute().parent.parent / "config" / "database.json", "r", encoding="utf-8") as f: 12 | config = json.load(f) 13 | print("读取database.json成功~") 14 | except Exception as e: 15 | print("database.json载入失败,请检查内容并重新启动~\r\n{}".format(e)) 16 | sys.exit(0) 17 | 18 | myclient = pymongo.MongoClient(config['mongodb']) 19 | mydb = myclient[config['database']] 20 | mycol = mydb[config['collection']] 21 | mycol_del = mydb[config['collection_del']] 22 | 23 | 24 | class Database: 25 | 26 | def __init__(self, data: List[Setu]): 27 | self.collection = mycol 28 | self.collection_del = mycol_del 29 | self.setus = data 30 | 31 | def filter_del(self): 32 | """ 33 | 过滤掉删除过的 34 | :return: 35 | """ 36 | setus_copy = self.setus.copy() 37 | for setu in setus_copy: 38 | if self.collection_del.find_one({'pid': setu.artwork.id, 'page': setu.page}): 39 | self.setus.remove(setu) 40 | 41 | def filter_changing_and_repeating(self): 42 | """ 43 | 过滤重复的和有改变的 44 | :return: 45 | """ 46 | setus_copy = self.setus.copy() 47 | for setu in setus_copy: 48 | if data := self.collection.find_one({'artwork.id': setu.artwork.id, 'page': setu.page}): # 是否存在 49 | if data['urls']['original'] == str(setu.urls.original): 50 | self.setus.remove(setu) 51 | print('数据一致') 52 | else: 53 | print('数据不一致') 54 | print(setu.create_date, '---', data['create_date']) 55 | result = self.collection.delete_many({'artwork.id': setu.artwork.id}) # 删除这个id的所有数据 56 | print("已删除{}条关于id:{}的数据".format(result.deleted_count, setu.artwork.id)) 57 | 58 | def insertData(self): 59 | data_insert = [data.dict() for data in self.setus] 60 | if data_insert == []: 61 | return 62 | result = self.collection.insert_many(data_insert) 63 | for setu in self.setus: 64 | print('{} P:{}'.format(setu.artwork.id, setu.page)) 65 | print('增加:{}'.format(len(result.inserted_ids))) 66 | 67 | def main(self): 68 | self.filter_del() 69 | self.filter_changing_and_repeating() 70 | self.insertData() 71 | print('-' * 20) 72 | -------------------------------------------------------------------------------- /spider/database/process_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List 3 | 4 | from spider.model import Setu, Artwork, Author, Size, Url 5 | 6 | 7 | def process_data(data: list) -> List[Setu]: 8 | setu_list = [] 9 | for setu in data: 10 | if setu['page_count'] == 1: # 单页 11 | setu_list.append( 12 | Setu( 13 | artwork=Artwork(title=setu['title'], id=setu['id']), 14 | author=Author(name=re.sub(r'[@,@].*$', '', setu['user']['name']), id=setu['user']['id']), 15 | sanity_level=setu['sanity_level'], 16 | r18=None, 17 | page=0, 18 | create_date=setu['create_date'], 19 | size=Size(width=setu['width'], height=setu['height']), 20 | tags=[tag['name'] for tag in setu['tags']] + 21 | [tag['translated_name'] for tag in setu['tags'] if tag['translated_name'] != None], 22 | urls=Url(original=setu['meta_single_page']['original_image_url'], 23 | large=setu['image_urls']['large'], 24 | medium=setu['image_urls']['medium']) 25 | ) 26 | ) 27 | else: # 多页 28 | for i in range(setu['page_count']): 29 | setu_list.append( 30 | Setu( 31 | artwork=Artwork(title=setu['title'], id=setu['id']), 32 | author=Author(name=re.sub(r'[@,@].*$', '', setu['user']['name']), id=setu['user']['id']), 33 | sanity_level=setu['sanity_level'], 34 | r18=None, 35 | page=i, 36 | create_date=setu['create_date'], 37 | size=Size(width=setu['width'], height=setu['height']), 38 | tags=[tag['name'] for tag in setu['tags']] + 39 | [tag['translated_name'] for tag in setu['tags'] if tag['translated_name'] != None], 40 | urls=Url(original=setu['meta_pages'][i]['image_urls']['original'], 41 | large=setu['meta_pages'][i]['image_urls']['large'], 42 | medium=setu['meta_pages'][i]['image_urls']['medium']) 43 | ) 44 | ) 45 | return setu_list 46 | -------------------------------------------------------------------------------- /spider/main.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import urllib.parse as urlparse 4 | 5 | import httpx 6 | 7 | from database import process_data, Database 8 | from pixivAPI import PixivToken, PixivFavorites 9 | 10 | if __name__ == '__main__': 11 | pixivToken = PixivToken() 12 | pixivToken.main() 13 | with httpx.Client() as session: 14 | data = PixivFavorites(pixivToken.tokendata['response']['user']['id'], 15 | pixivToken.tokendata['access_token'], 16 | session).favorites() # 第一次进入收藏夹 17 | print(data) 18 | Database(data=process_data(data['illusts'])).main() 19 | while True: 20 | if data['next_url'] == None: # 到最后一页就停止 21 | print('>>done<<') 22 | break 23 | time.sleep(random.randint(2, 4)) 24 | data = PixivFavorites(pixivToken.tokendata['response']['user']['id'], 25 | pixivToken.tokendata['access_token'], 26 | session).favorites(max_bookmark_id=int( 27 | urlparse.parse_qs(urlparse.urlparse(data['next_url']).query)['max_bookmark_id'][0])) # 第一次进入收藏夹 28 | print(data) 29 | Database(data=process_data(data['illusts'])).main() 30 | # print(data['max_bookmark_id']) 31 | -------------------------------------------------------------------------------- /spider/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .setudata_finish import Setu, Artwork, Author, Size, Url 2 | -------------------------------------------------------------------------------- /spider/model/setudata_finish.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Union 3 | 4 | from pydantic import BaseModel, HttpUrl 5 | 6 | 7 | class Url(BaseModel): 8 | original: HttpUrl 9 | large: HttpUrl 10 | medium: HttpUrl 11 | 12 | 13 | class Artwork(BaseModel): 14 | title: str 15 | id: int 16 | 17 | 18 | class Author(BaseModel): 19 | name: str 20 | id: int 21 | 22 | 23 | class Size(BaseModel): 24 | width: int 25 | height: int 26 | 27 | 28 | class Setu(BaseModel): 29 | artwork: Artwork 30 | author: Author 31 | sanity_level: int 32 | r18: Union[bool, None] 33 | page: int 34 | create_date: datetime 35 | size: Size 36 | tags: List[str] 37 | urls: Url 38 | 39 | # 40 | # class Setus(BaseModel): 41 | # data: List[Data] 42 | -------------------------------------------------------------------------------- /spider/pixivAPI/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixivFavorites import PixivFavorites 2 | from .pixivToken import PixivToken 3 | -------------------------------------------------------------------------------- /spider/pixivAPI/_proxies.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from pathlib import Path 4 | 5 | from httpx_socks import SyncProxyTransport 6 | 7 | try: 8 | with open(Path(__file__).absolute().parent.parent / "config" / "proxies.json", "r", encoding="utf-8") as f: 9 | proxiesConfig = json.load(f) 10 | print("读取proxies.json成功~") 11 | except Exception as e: 12 | print("proxies.json载入失败,请检查内容并重新启动~\r\n{}".format(e)) 13 | sys.exit(0) 14 | 15 | if proxies_socks := proxiesConfig.get("proxies_socks"): 16 | transport = SyncProxyTransport.from_url(proxies_socks) 17 | proxies = None 18 | else: 19 | transport = None 20 | proxies = proxiesConfig.get("proxies_http") 21 | -------------------------------------------------------------------------------- /spider/pixivAPI/pixivFavorites.py: -------------------------------------------------------------------------------- 1 | # import httpx 2 | import hashlib 3 | from datetime import datetime 4 | 5 | from retrying import retry 6 | 7 | 8 | class PixivFavorites: 9 | def __init__(self, userid, access_token, session): 10 | self.userid = userid 11 | self.access_token = access_token 12 | self.session = session 13 | 14 | def headers(self) -> dict: 15 | hash_secret = "28c1fdd170a5204386cb1313c7077b34f83e4aaf4aa829ce78c231e05b0bae2c" 16 | X_Client_Time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+08:00") 17 | X_Client_Hash = hashlib.md5( 18 | (X_Client_Time + hash_secret).encode("utf-8") 19 | ).hexdigest() 20 | return { 21 | 'Authorization': 'Bearer ' + self.access_token, 22 | "User-Agent": "PixivAndroidApp/5.0.197 (Android 10; Redmi 4)", 23 | "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8", 24 | "Accept-Language": "zh_CN_#Hans", 25 | "App-OS": "android", 26 | "App-OS-Version": "10", 27 | "App-Version": "5.0.197", 28 | "X-Client-Time": X_Client_Time, 29 | "X-Client-Hash": X_Client_Hash, 30 | "Host": "app-api.pixiv.net" 31 | } 32 | 33 | @retry(stop_max_attempt_number=5, wait_random_max=2000) 34 | def favorites(self, max_bookmark_id=None) -> dict: 35 | params = {'user_id': self.userid, 36 | 'restrict': 'public'} # 公开收藏夹 37 | if max_bookmark_id: 38 | params['max_bookmark_id'] = max_bookmark_id 39 | # print(params) 40 | res = self.session.get( 41 | url='https://app-api.pixiv.net/v1/user/bookmarks/illust', 42 | params=params, 43 | headers=self.headers() 44 | ).json() 45 | return res 46 | -------------------------------------------------------------------------------- /spider/pixivAPI/pixivToken.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import sys 4 | import time 5 | import uuid 6 | from datetime import datetime, timedelta 7 | from pathlib import Path 8 | 9 | import httpx 10 | from apscheduler.schedulers.background import BackgroundScheduler 11 | from retrying import retry 12 | 13 | from ._proxies import proxies, transport 14 | 15 | scheduler = BackgroundScheduler() 16 | 17 | 18 | class PixivToken: 19 | def __init__(self): 20 | self.tokenPath = Path(__file__).absolute().parent.parent / "config" / ".PixivToken.json" 21 | self.tokendata = {} 22 | self.Client = httpx.Client(proxies=proxies, transport=transport) 23 | 24 | def headers(self): 25 | hash_secret = "28c1fdd170a5204386cb1313c7077b34f83e4aaf4aa829ce78c231e05b0bae2c" 26 | X_Client_Time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+08:00") 27 | X_Client_Hash = hashlib.md5( 28 | (X_Client_Time + hash_secret).encode("utf-8") 29 | ).hexdigest() 30 | headers = { 31 | "User-Agent": "PixivAndroidApp/5.0.197 (Android 10; Redmi 4)", 32 | "Content-Type": "application/x-www-form-urlencoded", 33 | "Accept-Language": "zh_CN_#Hans", 34 | "App-OS": "android", 35 | "App-OS-Version": "10", 36 | "App-Version": "5.0.197", 37 | "X-Client-Time": X_Client_Time, 38 | "X-Client-Hash": X_Client_Hash, 39 | "Host": "oauth.secure.pixiv.net", 40 | "Accept-Encoding": "gzip", 41 | } 42 | return headers 43 | 44 | @retry(stop_max_attempt_number=3, wait_random_max=5000) 45 | def refresh_token(self): 46 | url = "https://oauth.secure.pixiv.net/auth/token" 47 | print("尝试刷新Pixiv_token") 48 | data = { 49 | "client_id": "MOBrBDS8blbauoSck0ZfDbtuzpyT", 50 | "client_secret": "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj", 51 | "grant_type": "refresh_token", 52 | "refresh_token": self.tokendata["refresh_token"], 53 | "device_token": self.tokendata["device_token"] 54 | if "device_token" in self.tokendata.keys() 55 | else uuid.uuid4().hex, 56 | "get_secure_url": "true", 57 | "include_policy": "true", 58 | } 59 | self.tokendata = self.Client.post(url, data=data, headers=self.headers()).json() 60 | self.tokendata["time"] = time.time() 61 | print("刷新token成功~") 62 | self.saveToken() 63 | 64 | def continue_refresh_token(self): 65 | try: 66 | self.refresh_token() 67 | except: 68 | print("刷新失败") 69 | nextTime = 300 70 | else: 71 | nextTime = int( 72 | self.tokendata["expires_in"] - (time.time() - self.tokendata["time"]) 73 | ) 74 | self.addJob(nextTime) 75 | return 76 | 77 | def saveToken(self): 78 | with open(self.tokenPath, "w", encoding="utf-8") as f: 79 | json.dump(self.tokendata, f, indent=4, ensure_ascii=False) 80 | print("PixivToken已保存到.PixivToken.json") 81 | return 82 | 83 | def addJob(self, next_time: int): 84 | print("离下次刷新还有:{}s".format(next_time)) 85 | scheduler.add_job( 86 | self.continue_refresh_token, 87 | next_run_time=datetime.now() + timedelta(seconds=next_time - 1), 88 | misfire_grace_time=30, 89 | ) 90 | 91 | def main(self): 92 | try: 93 | with open(self.tokenPath, "r", encoding="utf-8") as f: 94 | self.tokendata = json.load(f) 95 | print("读取.PixivToken.json成功~") 96 | except Exception as e: 97 | print(".PixivToken.json载入失败,请检查内容并重新启动~\r\n{}".format(e)) 98 | sys.exit(0) 99 | if self.tokendata["refresh_token"] == "": 100 | print("PixivToken不存在") 101 | sys.exit(0) 102 | if "time" not in self.tokendata.keys(): # 没time字段就是第一次启动 103 | self.continue_refresh_token() 104 | return 105 | if time.time() - self.tokendata["time"] >= int( 106 | self.tokendata["expires_in"] 107 | ): # 停止程序后再次启动时间后的间隔时间超过刷新间隔 108 | self.continue_refresh_token() 109 | return 110 | self.addJob( 111 | int(self.tokendata["expires_in"] - (time.time() - self.tokendata["time"])) 112 | ) 113 | --------------------------------------------------------------------------------