├── javbus ├── __init__.py ├── Utils │ ├── __init__.py │ ├── syLogger.py │ ├── syFileOperator.py │ ├── JAVBusMySQLDBManager.py │ └── JAVBusImageDownloadQueue.py ├── middlewares │ └── __init__.py ├── spiders │ ├── __init__.py │ └── JavbusSpider.py ├── items.py ├── middlewares.py ├── settings.py └── pipelines.py ├── require.txt ├── scrapy.cfg ├── .gitignore ├── README.md └── JAVBusImageDownloader.py /javbus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /javbus/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /javbus/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /require.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pymysql 3 | scrapy 4 | scrapy-splash 5 | scrapyd 6 | scrapyd-client -------------------------------------------------------------------------------- /javbus/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = javbus.settings 8 | 9 | [deploy] 10 | url = http://localhost:6800/ 11 | project = javbus 12 | -------------------------------------------------------------------------------- /javbus/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JavbusItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | cover = scrapy.Field() 16 | code = scrapy.Field() 17 | date = scrapy.Field() 18 | duration = scrapy.Field() 19 | series = scrapy.Field() 20 | type = scrapy.Field() 21 | actress = scrapy.Field() 22 | magnet = scrapy.Field() 23 | size = scrapy.Field() 24 | samplePic = scrapy.Field() 25 | link = scrapy.Field() 26 | 27 | 28 | class JavBusImageItem(scrapy.Item): 29 | image_urls = scrapy.Field() 30 | images = scrapy.Field() 31 | image_paths = scrapy.Field() -------------------------------------------------------------------------------- /javbus/Utils/syLogger.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import datetime 4 | 5 | 6 | #单例模式 7 | def singleton(cls, *args, **kw): 8 | instances = {} 9 | def _singleton(): 10 | if cls not in instances: 11 | instances[cls] = cls(*args, **kw) 12 | return instances[cls] 13 | return _singleton 14 | 15 | 16 | #log记录类 17 | @singleton 18 | class syLoggerManager(): 19 | 20 | 
isDebugMode = True 21 | 22 | def __init__(self): 23 | 24 | print ('Logger Init -----> if console shows many times , there is a bug check!') 25 | 26 | # self.logFilePath = os.getcwd() 27 | self.logFilePath = os.path.dirname(os.path.realpath(__file__)) 28 | self.logFilePath = os.path.join(self.logFilePath , 'log.txt') 29 | if os.path.exists(self.logFilePath): 30 | #存在log文件 31 | #py3.0 32 | # f = open(self.logFilePath,'a',encoding='utf8') 33 | #py2.0 34 | f = open(self.logFilePath, 'a') 35 | f.write('\n\n今天日志' + '==============' +str(datetime.date.today()) + '==============') 36 | else: 37 | f = open(self.logFilePath,'w') 38 | f.write('今天日志' + '==============' +str(datetime.date.today()) + '==============') 39 | f.close() 40 | 41 | 42 | def syLog(self,info): 43 | if info is None: 44 | return 45 | if self.isDebugMode == True: 46 | f = open(self.logFilePath,'a') 47 | f.write('\n %s =====> %s'%(str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) , info)) 48 | f.close() 49 | 50 | def syLogManyLines(self,infoArray): 51 | if infoArray is None: 52 | return 53 | if self.isDebugMode == True: 54 | f = open(self.logFilePath, 'a') 55 | for info in infoArray: 56 | 57 | f.write('\n %s' % (info)) 58 | f.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .idea 7 | 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | # ---> Objective-C 64 | # Xcode 65 | # 66 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 67 | 68 | ## Build generated 69 | build/ 70 | DerivedData 71 | 72 | ## Various settings 73 | *.pbxuser 74 | !default.pbxuser 75 | *.mode1v3 76 | !default.mode1v3 77 | *.mode2v3 78 | !default.mode2v3 79 | *.perspectivev3 80 | !default.perspectivev3 81 | xcuserdata 82 | 83 | ## Other 84 | *.xccheckout 85 | *.moved-aside 86 | *.xcuserstate 87 | *.xcscmblueprint 88 | 89 | ## Obj-C/Swift specific 90 | *.hmap 91 | *.ipa 92 | 93 | # CocoaPods 94 | # 95 | # We recommend against adding the Pods directory to your .gitignore. However 96 | # you should judge for yourself, the pros and cons are mentioned at: 97 | # http://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 98 | # 99 | #Pods/ 100 | 101 | # Carthage 102 | # 103 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 
104 | # Carthage/Checkouts
105 | 
106 | Carthage/Build
107 | 
108 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pyJAVBus

### Description:
A practice crawler project for `JAVBus`, for seasoned drivers only.


### Notes on crawling Ajax content:
On the Python side it relies on the `scrapy-splash` library.
On the Docker side it uses the `scrapinghub/splash` image.

### Warning:
**CentOS Only**
**Python3 Only**
1. The built-in "start Docker" and "clear swap" steps in the scripts are only intended for `CentOS`.
2. Create a `JavBusPython` database on your local MySQL server before running (see the sketch just below).
3. The database must use the `UTF8` character set (it can be created through `phpMyAdmin`); with the wrong charset, Japanese text cannot be inserted.
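For reference, a minimal sketch of creating that database with `pymysql` (a hypothetical one-off helper, not part of the project; the credentials mirror the defaults in `pipelines.py`, adjust them to your server):

```python
import pymysql

# One-off helper: create the JavBusPython database with a UTF-8 charset.
# Host, user and password below are the defaults used in pipelines.py.
conn = pymysql.connect(host='localhost', user='root',
                       password='22f25f9d81f4d21d', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(
            "CREATE DATABASE IF NOT EXISTS JavBusPython "
            "CHARACTER SET utf8 COLLATE utf8_general_ci"
        )
finally:
    conn.close()
```

The same statement can be run just as well from the MySQL console or phpMyAdmin.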
### USAGE:
1. Install the Python dependencies first

```python
pip install -r require.txt
```
2. Pull the Docker image

```c
docker pull scrapinghub/splash
```

~~3. Run Docker~~
(The spider already starts Docker and clears swap memory by itself.)

```c
docker run -p 8050:8050 scrapinghub/splash
```

4. Run the spider

```python
python -m scrapy crawl JavbusSpider
```

### PS: Things you need to change:
The database credentials in `pipelines.py`
The crawl depth (page range) in `JavbusSpider.py`; by default only the first and second pages are crawled

### Deployment (optional; the scripts can also be started by hand):

1. Deployment uses `scrapyd` and `scrapyd-client`, both already listed in `require.txt`.

2. Start the `Scrapyd` service

```c
scrapyd
```

or start it in the background

```c
nohup scrapyd > /dev/null 2>&1 &
```

Once started, it can be reached at `http://localhost:6800`.

`If it is deployed on a remote server and you need to reach it from outside, edit the config shipped with the package.`

For example, with a `pip` install the file lives at `xxx(python library path)/site-packages/scrapyd/default_scrapyd.conf`.

Set `bind_address = 0.0.0.0` and open the matching firewall rule; other machines can then reach the `GUI` console at `http://<server-ip>:6800`.

3. Deploy the spider

Enter the project and edit the `url` in `scrapy.cfg`.

Then run

```c
scrapyd-deploy [deploy_name]
```

to deploy, where `deploy_name` is one of the names listed by

```c
scrapyd-deploy -l
```

4. Run the spider

Start a spider through the API:

```c
curl http://localhost:6800/schedule.json -d project=PROJECT_NAME -d spider=SPIDER_NAME
```

Stop a running job:

```c
curl http://localhost:6800/cancel.json -d project=PROJECT_NAME -d job=JOB_ID
```

`PROJECT_NAME` is the project name from `scrapy.cfg`.

`SPIDER_NAME` is the spider's name as defined under the `spiders` folder.

(The same two calls can also be issued from Python with `requests`; a short sketch is appended at the end of this document.)

5. Watch the crawl at `http://<server-ip>:6800`.




#### Last But Not Least

The scripts only scrape the metadata and store it in the database; they do not download any images. If you also want the image files, run

`python JAVBusImageDownloader.py`

which downloads every image referenced in the database into a newly created `PYJAVBUS` folder one level above the project root.


--------------------------------------------------------------------------------
/javbus/Utils/syFileOperator.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | import shutil
4 | 
5 | # Blacklist of path fragments whose files should not be counted
6 | fileTypeBlackList = ['.git', '.idea', '.DS_Store']
7 | 
8 | 
9 | class syFileOperator():
10 |     def __init__(self):
11 |         # Directory that contains this file
12 |         self.currentPath = os.path.dirname(os.path.realpath(__file__))
13 | 
14 |     # Check whether a path exists
15 |     def isExistsFilePath(self, filePath):
16 |         return os.path.exists(filePath)
17 | 
18 |     # Remove a directory tree
19 |     def removeDirPath(self, dirPath):
20 |         if self.isExistsFilePath(dirPath):
21 |             shutil.rmtree(dirPath)
22 | 
23 |     # Remove a single file
24 |     def removeFilePath(self, filePath):
25 |         if self.isExistsFilePath(filePath):
26 |             os.remove(filePath)
27 | 
28 |     # Create a new directory
29 |     def createDirPath(self, dirPath):
30 |         if self.isExistsFilePath(dirPath):
31 |             print('Directory already exists')
32 |         else:
33 |             os.mkdir(dirPath)
34 | 
35 |     # Copy a file into a directory
36 |     def copyFileWithDir(self, oriPath, desPathDir):
37 |         if oriPath is None or desPathDir is None:
38 |             return
39 |         if not self.isExistsFilePath(oriPath):
40 |             return
41 |         if not self.isExistsFilePath(desPathDir):
42 |             self.createDirPath(desPathDir)
43 |         shutil.copy(oriPath, desPathDir)
44 | 
45 |     # Size of a single file (in MB)
46 |     def getFileSize(self, filePath):
47 |         if not self.isExistsFilePath(filePath):
48 |             return 0
49 |         size = os.path.getsize(filePath)
50 |         return size / 1024 / 1024
51 | 
52 |     # Total size of all files under a directory
53 |     # Returns a tuple (total size in MB, file count)
54 |     def getDirSize(self, dirPath):
55 |         filesize = 0
56 |         fileCount = 0
57 |         if not self.isExistsFilePath(dirPath):
58 |             return (0, 0)
59 |         # os.walk yields the parent directory, the directory names and the file names
60 |         for parent, dirnames, filenames in os.walk(dirPath):
61 |             for filename in filenames:
62 |                 currentPath = os.path.join(parent, filename)
63 |                 # Skip directories
64 |                 if os.path.isdir(currentPath):
65 |                     continue
66 |                 # Skip files whose path contains a blacklisted fragment
67 |                 if any(blackType in currentPath for blackType in fileTypeBlackList):
68 |                     continue
69 |                 filesize += os.path.getsize(currentPath) / 1024 / 1024
70 |                 fileCount += 1
71 |         return (filesize, fileCount)
72 | 
--------------------------------------------------------------------------------
/javbus/Utils/JAVBusMySQLDBManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import pymysql
4 | from javbus.Utils import syLogger
5 | import re
6 | 
7 | # Singleton decorator
8 | def singleton(cls, *args, **kw):
9 |     instances = {}
10 | 
11 |     def _singleton():
12 |         if cls not in instances:
13 |             instances[cls] = cls(*args, **kw)
14 |         return instances[cls]
15 | 
16 |     return _singleton
17 | 
18 | 
19 | # Database access class
20 | @singleton
21 | class dbManager():
22 |     javDBName = 'JavBusPython'
23 |     javTableName = 'javBusTable'
24 |     DBHost = "localhost"
25 |     DBPort = 3306
26 |     DBUser = 'root'
27 |     DBPassWord = '22f25f9d81f4d21d'
28 |     DBCharset = 'utf8'
29 | 
30 | 
31 |     def __init__(self):
32 |         self.logger = syLogger.syLoggerManager()
33 | 34 | 35 | def __executeStrings(self,sqlCommandsList,cursor): 36 | if len(sqlCommandsList) == 0 or cursor is None: 37 | return 38 | try: 39 | for sql in sqlCommandsList: 40 | cursor.execute(sql) 41 | except Exception as e: 42 | self.logger.syLog(str(e)) 43 | 44 | # 执行多sql语句 45 | def executeSQLArray(self, sqlStringArray): 46 | if len(sqlStringArray) == 0: 47 | print ('SQLString数据为空') 48 | return 49 | conn = pymysql.connect(self.DBHost, self.DBUser, self.DBPassWord, self.javDBName, 50 | charset=self.DBCharset) 51 | cursor = conn.cursor() 52 | try: 53 | for sql in sqlStringArray: 54 | cursor.execute(sql) 55 | print('sql执行成功') 56 | except Exception as e: 57 | self.logger.syLog(str(e)) 58 | finally: 59 | cursor.close() 60 | conn.commit() 61 | conn.close() 62 | 63 | 64 | # 执行命令 65 | def execute(self, executeString): 66 | fetchResult = [] 67 | conn = pymysql.connect(self.DBHost,self.DBUser,self.DBPassWord,self.javDBName,charset=self.DBCharset) 68 | cursor = conn.cursor() 69 | try: 70 | results = cursor.execute(executeString) 71 | fetchResult = cursor.fetchall(); 72 | print('sql执行成功') 73 | # print('executeString:%s'%executeString) 74 | except Exception as e: 75 | self.logger.syLog(str(e)) 76 | cursor.close() 77 | conn.commit() 78 | conn.close() 79 | # self.dbLock.release() 80 | return fetchResult 81 | 82 | 83 | 84 | 85 | # 执行命令(字典返回) 86 | def executeWithDictReturn(self, executeString): 87 | fetchResult = [] 88 | conn = pymysql.connect(self.DBHost,self.DBUser,self.DBPassWord,self.javDBName,charset=self.DBCharset) 89 | cursor = conn.cursor(cursor=pymysql.cursors.DictCursor) 90 | try: 91 | results = cursor.execute(executeString) 92 | fetchResult = cursor.fetchall(); 93 | print('sql字典执行成功') 94 | except Exception as e: 95 | self.logger.syLog(str(e)) 96 | 97 | cursor.close() 98 | conn.commit() 99 | conn.close() 100 | return fetchResult 101 | 102 | -------------------------------------------------------------------------------- /JAVBusImageDownloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | import os 5 | import queue 6 | import sys 7 | from javbus.Utils import syFileOperator 8 | from javbus.Utils import syLogger 9 | from javbus.Utils import JAVBusMySQLDBManager 10 | from javbus.Utils import JAVBusImageDownloadQueue 11 | 12 | 13 | 14 | class JAVImageDownloader(): 15 | 16 | 17 | def __init__(self): 18 | 19 | self.fileOperator = syFileOperator.syFileOperator() 20 | self.dbManager = JAVBusMySQLDBManager.dbManager() 21 | self.logger = syLogger.syLoggerManager() 22 | # 创建资源目录 23 | # PYJAVBUS ... 
(项目文件根)javbus (同级) /javbus / Utils / currentPath 24 | self.sourcePath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(self.fileOperator.currentPath))),'PYJAVBUS') 25 | if self.fileOperator.isExistsFilePath(self.sourcePath) == False: 26 | self.fileOperator.createDirPath(self.sourcePath) 27 | #Queue 28 | self.downloadImageQueue = queue.Queue(maxsize = 5) 29 | #Resource 30 | self.downloadSource = [] 31 | 32 | 33 | def startDownLoadImage(self): 34 | print('SourceDownloadPath:%s'%self.sourcePath) 35 | sqlString = '''SELECT * FROM %s'''%self.dbManager.javTableName 36 | items = self.dbManager.executeWithDictReturn(sqlString) 37 | if len(sqlString) == 0 or sqlString is None: 38 | self.logger.syLog('无下载资源') 39 | return 40 | 41 | for item in items: 42 | if len(item['samplePic']) == 0 or item['samplePic'] is None or len(item['cover']) == 0: 43 | self.logger.syLog('无样例图片') 44 | continue 45 | rootPath = item['code'] 46 | rootPath = os.path.join(self.sourcePath, rootPath) 47 | if self.fileOperator.isExistsFilePath(rootPath) == False: 48 | self.fileOperator.createDirPath(rootPath) 49 | # 样品图 50 | samplePic = item['samplePic'] 51 | #如果没有图片则不用添加 52 | if samplePic is not None and len(samplePic) > 0: 53 | samplePicArray = samplePic.split('||') 54 | for picURLAddress in samplePicArray: 55 | downInfoDict = { 56 | 57 | "url":str(picURLAddress), 58 | "code":str(item['code']), 59 | "type":"samplePic" 60 | 61 | } 62 | self.downloadSource.append(downInfoDict) 63 | 64 | if item['cover'] is not None : 65 | downInfoDict = { 66 | 67 | "url":str(item['cover']), 68 | "code":str(item['code']), 69 | "type":"cover" 70 | 71 | } 72 | self.downloadSource.append(downInfoDict) 73 | 74 | #多线程下载 75 | for i in range(10): 76 | queue = JAVBusImageDownloadQueue.javBusImageDownloadQueue(self.downloadImageQueue) 77 | queue.setDaemon(True) 78 | queue.start() 79 | for i in range(len(self.downloadSource)): 80 | self.downloadImageQueue.put(self.downloadSource[i]) 81 | 82 | 83 | self.downloadImageQueue.join() 84 | 85 | 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 91 | imageDownloader = JAVImageDownloader() 92 | imageDownloader.startDownLoadImage() 93 | print('结束清除SWAP') 94 | os.system('date ; sleep 10 ; swapoff -a && swapon -a ; date') 95 | print('清除SWAP成功') -------------------------------------------------------------------------------- /javbus/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JavbusSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JavbusDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /javbus/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for javbus project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | import os 12 | 13 | BOT_NAME = 'javbus' 14 | 15 | SPIDER_MODULES = ['javbus.spiders'] 16 | NEWSPIDER_MODULE = 'javbus.spiders' 17 | SPLASH_URL = 'http://localhost:8050' 18 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 19 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'javbus (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | CONCURRENT_ITEMS = 60 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | DOWNLOAD_DELAY = 0.25 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 53 | SPIDER_MIDDLEWARES = { 54 | # 'javbus.middlewares.JavbusSpiderMiddleware': 543, 55 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 56 | } 57 | 58 | 59 | # Enable or disable downloader middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 61 | DOWNLOADER_MIDDLEWARES = { 62 | # 'javbus.middlewares.JavbusDownloaderMiddleware': 543, 63 | # 'javbus.middlewares.JavBusMiddleware.JavbusMiddleware': 543, 64 | 65 | 'scrapy_splash.SplashCookiesMiddleware': 723, 66 | 'scrapy_splash.SplashMiddleware': 725, 67 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 68 | } 69 | 70 | 71 | # Enable or disable extensions 72 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 73 | #EXTENSIONS = { 74 | # 'scrapy.extensions.telnet.TelnetConsole': None, 75 | #} 76 | 77 | # Configure item pipelines 78 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 79 | ITEM_PIPELINES = { 80 | 'javbus.pipelines.JavbusPipeline': 300, 81 | # 'javbus.javImagesPipeline.JavImgDownloadPipeline': 301, 82 | } 83 | 84 | #设置图片下载路径 85 | IMAGES_STORE = os.path.join(os.path.dirname(os.getcwd()),'PYJAVBUS') 86 | # 过期天数 87 | IMAGES_EXPIRES = 10 #90天内抓取的都不会被重抓 88 | 89 | 90 | # Enable and configure the AutoThrottle extension (disabled by default) 91 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 92 | #AUTOTHROTTLE_ENABLED = True 93 | # The initial download delay 94 | #AUTOTHROTTLE_START_DELAY = 5 95 | # The maximum download delay to be set in case of high latencies 96 | #AUTOTHROTTLE_MAX_DELAY = 60 97 | # The average number of requests Scrapy should be sending in parallel to 98 | # each remote server 99 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 100 | # Enable showing 
throttling stats for every response received: 101 | #AUTOTHROTTLE_DEBUG = False 102 | 103 | # Enable and configure HTTP caching (disabled by default) 104 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 105 | #HTTPCACHE_ENABLED = True 106 | #HTTPCACHE_EXPIRATION_SECS = 0 107 | #HTTPCACHE_DIR = 'httpcache' 108 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 109 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 110 | -------------------------------------------------------------------------------- /javbus/Utils/JAVBusImageDownloadQueue.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import queue 3 | import threading 4 | import requests 5 | from javbus.Utils import syLogger 6 | from javbus.Utils import syFileOperator 7 | import os 8 | 9 | class javBusImageDownloadQueue(threading.Thread): 10 | def __init__(self,queue): 11 | threading.Thread.__init__(self) 12 | self.queue = queue 13 | self.downloadLock = threading.Lock() 14 | self.fileOperator = syFileOperator.syFileOperator() 15 | self.logger = syLogger.syLoggerManager() 16 | # 资源目录 17 | self.sourcePath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(self.fileOperator.currentPath))),'PYJAVBUS') 18 | self.logInfo = [] 19 | 20 | 21 | def run(self): 22 | while True: 23 | item = self.queue.get() 24 | rootPath = item['code'] 25 | rootPath = os.path.join(self.sourcePath, rootPath) 26 | if self.fileOperator.isExistsFilePath(rootPath) == False: 27 | self.fileOperator.createDirPath(rootPath) 28 | url = item['url'] 29 | 30 | #封面图 31 | if item['type'] == 'cover': 32 | coverURL = url 33 | coverSplit = coverURL.split('/') 34 | cover_name = str(coverSplit[len(coverSplit) - 1]) 35 | coverPath = os.path.join(rootPath, cover_name) 36 | if self.fileOperator.isExistsFilePath(coverPath) == True: 37 | info = '存在 %s 封面 %s ' % (str(item['code']), cover_name) 38 | print(info) 39 | self.logInfo.append(info) 40 | self.queue.empty() 41 | self.queue.task_done() 42 | else: 43 | try: 44 | ir = requests.get(coverURL, timeout=2) 45 | except Exception as e: 46 | info = '下载 %s 封面 %s 错误 Error : %s ' % (str(item['code']), cover_name, str(e)) 47 | print(info) 48 | self.logInfo.append(info) 49 | self.queue.task_done() 50 | if ir.status_code == 200: 51 | with open(coverPath, 'wb') as f: 52 | f.write(ir.content) 53 | f.close() 54 | info = '下载 %s 封面 %s 成功' % (str(item['code']), cover_name) 55 | self.logInfo.append(info) 56 | print('===>写入 %s 封面 %s 成功' % (str(item['code']), cover_name)) 57 | print('CoverPath:%s'%str(coverPath)) 58 | self.queue.task_done() 59 | 60 | 61 | 62 | #样例图片下载 63 | elif item['type'] == 'samplePic': 64 | list_name = url.split('/') 65 | # 图片名称 66 | file_name = str(list_name[len(list_name) - 1]) 67 | filePath = os.path.join(rootPath, file_name) 68 | # 存在图片则不用下载 69 | if self.fileOperator.isExistsFilePath(filePath) == True: 70 | info = '存在 %s 样品图 %s' % (str(item['code']), file_name) 71 | self.logInfo.append(info) 72 | print(info) 73 | self.queue.empty() 74 | self.queue.task_done() 75 | else: 76 | try: 77 | ir = requests.get(url, timeout=2) 78 | except Exception as e: 79 | info = '下载 %s 样品图 %s 错误 Error : %s ' % (str(item['code']), file_name, str(e)) 80 | print(info) 81 | self.logInfo.append(info) 82 | self.queue.task_done() 83 | if ir.status_code == 200: 84 | with open(filePath, 'wb') as f: 85 | f.write(ir.content) 86 | f.close() 87 | info = '下载 %s 样品图 %s 成功' % (str(item['code']), file_name) 88 | 
self.logInfo.append(info) 89 | print('下载 %s 样品图 %s 成功' % (str(item['code']), file_name)) 90 | print('===>写入 %s 样品图 %s 成功' % (str(item['code']), file_name)) 91 | print('CoverPath:%s' % str(filePath)) 92 | self.queue.task_done() 93 | 94 | 95 | def _stop(self): 96 | # self.logger.syLogManyLines(self.logInfo) 97 | print('End Queue') 98 | -------------------------------------------------------------------------------- /javbus/spiders/JavbusSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import os 4 | import time 5 | from scrapy.http import Request 6 | from javbus.items import JavbusItem 7 | from scrapy_splash import SplashRequest 8 | 9 | 10 | class JavbusspiderSpider(scrapy.Spider): 11 | #关闭Docker 12 | print('关闭DOCKER!') 13 | os.system('docker kill splash') 14 | os.system('docker rm splash') 15 | print('关闭DOCKER命令完!') 16 | 17 | time.sleep(1) 18 | 19 | #启动Docker 20 | print('启动DOCKER!') 21 | os.system('nohup docker run --name splash -p 8050:8050 scrapinghub/splash > /dev/null 2>&1 &') 22 | print('启动DOCKER命令完!') 23 | 24 | time.sleep(3) 25 | 26 | print('启动爬虫') 27 | 28 | name = 'JavbusSpider' 29 | allowed_domains = ['www.javbus.cc'] 30 | start_urls = ['https://www.javbus.cc/page/1'] 31 | 32 | header = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} 33 | 34 | 35 | def start_requests(self): 36 | return [Request(self.start_urls[0],callback=self.parse,headers=self.header)] 37 | 38 | def parse(self, response): 39 | 40 | javList = response.xpath('//a[@class = "movie-box"]/@href').extract() 41 | for javItem in javList: 42 | # request = Request(javItem,callback=self.parse_inner) 43 | request = SplashRequest(javItem, callback=self.parse_inner,args={'wait': 10}) 44 | yield request 45 | for i in range(1,2,1): 46 | page_url = 'https://www.javbus.cc/page/{}'.format(i) 47 | yield Request(page_url,callback=self.parse,meta={'download_timeout' : 10}) 48 | 49 | 50 | def parse_inner(self,response): 51 | item = JavbusItem() 52 | #数组转字符 53 | #标题 54 | item['title'] = response.xpath('//div[@class = "container"]/h3/text()').extract() 55 | item['title'] = item['title'][0].replace('\t', "").replace('\n', "").replace(' ', "") 56 | #封面 57 | item['cover'] = response.xpath('//a[@class = "bigImage"]/@href').extract() 58 | item['cover'] = item['cover'][0] 59 | #番号 60 | item['code'] = response.xpath('//div[@class = "col-md-3 info"]/ p[1] / span[2]/text()').extract() 61 | item['code'] = item['code'][0] 62 | #发布日期 63 | item['date'] = response.xpath('//div[@class = "col-md-3 info"]/ p[2] / text()').extract() 64 | item['date'] = item['date'][0].replace('\t', "").replace('\n', "").replace(' ', "") 65 | #时长 66 | item['duration'] = response.xpath('//div[@class = "col-md-3 info"]/ p[3] /text()').extract() 67 | item['duration'] = item['duration'][0].replace('\t', "").replace('\n', "").replace(' ', "") 68 | #系列 69 | item['series'] = response.xpath('//span[@class = "genre"]/a/text()').extract() 70 | item['series'] = "||".join(item['series']) 71 | #类型 72 | item['type'] = response.xpath('//span[@class = "genre"]/a/text()').extract() 73 | item['type'] = "||".join(item['type']) 74 | #演员 75 | item['actress'] = response.xpath('//span[@class = "star-toggle"]/text()').extract() 76 | if item['actress'] is None or len(item['actress']) == 0: 77 | item['actress'] = '' 78 | else: 79 | item['actress'] = item['actress'][0] 80 | #封面图 81 | item['samplePic'] = 
response.xpath('//a[@class = "sample-box"]/@href').extract() 82 | item['samplePic'] = "||".join(item['samplePic']) 83 | #连接 84 | item['link'] = response.url 85 | #去重排序 86 | item['magnet'] = response.xpath('//table[@id = "magnet-table"]//tr//td/a/@href').extract() 87 | newMagnet = [] 88 | for id in item['magnet']: 89 | if id not in newMagnet: 90 | newMagnet.append(id) 91 | item['magnet'] = "||".join(newMagnet) 92 | 93 | #去空格 94 | item['size'] = response.xpath('//table[@id = "magnet-table"]//tr/td[2]/a/text()').extract() 95 | item['size'] = ("||".join(item['size'])).strip() 96 | item['size'] = item['size'].replace('\t',"").replace('\n',"").replace(' ',"") 97 | yield item 98 | 99 | @staticmethod 100 | def close(spider, reason): 101 | #关闭Spider 102 | print('关闭Spider') 103 | print('关闭DOCKER!') 104 | os.system('docker kill splash') 105 | os.system('docker rm splash') 106 | print('关闭DOCKER命令完!') 107 | os.system('date ; sleep 10 ; swapoff -a && swapon -a ; date') 108 | closed = getattr(spider, 'closed', None) 109 | if callable(closed): 110 | return closed(reason) 111 | 112 | -------------------------------------------------------------------------------- /javbus/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import pymysql 9 | import requests 10 | from scrapy.exceptions import DropItem 11 | 12 | 13 | class JavbusPipeline(object): 14 | 15 | def __init__(self): 16 | pass 17 | 18 | 19 | @classmethod 20 | def from_settings(cls, settings): 21 | print("Spider Singleton!") 22 | return cls() # 相当于conn付给了这个类,self中可以得到 23 | 24 | def open_spider(self, spider): 25 | print('open pipelines') 26 | javDBName = 'JavBusPython' 27 | DBHost = "localhost" 28 | DBPort = 3306 29 | DBUser = 'root' 30 | DBPassWord = '22f25f9d81f4d21d' 31 | DBCharset = 'utf8' 32 | self.dbConn = pymysql.connect(DBHost, DBUser, DBPassWord, javDBName, charset=DBCharset) 33 | # conn = pymysql.connect(host = DBHost, user = DBUser, passwd = DBPassWord , port = DBPort ,charset=DBCharset) 34 | self.cursor = self.dbConn.cursor() 35 | # cursor.execute('''CREATE DATABASE IF NOT EXISTS JavBusPython''') 36 | self.cursor.execute('''CREATE TABLE IF NOT EXISTS javBusTable( 37 | id INTEGER PRIMARY KEY auto_increment, 38 | title TEXT, 39 | cover TEXT, 40 | code TEXT, 41 | date TEXT, 42 | duration TEXT, 43 | series TEXT, 44 | type TEXT, 45 | actress TEXT, 46 | magnet TEXT, 47 | size TEXT, 48 | samplePic TEXT, 49 | link TEXT, 50 | LastIndexFlag TEXT 51 | )''') 52 | self.cursor.execute("set names 'utf8'") 53 | 54 | 55 | def close_spider(self, spider): 56 | self.dbConn.commit() 57 | self.cursor.close() 58 | self.dbConn.close() 59 | print("Spider Done!") 60 | 61 | 62 | def process_item(self, item, spider): 63 | 64 | self.updateOrInsertItem(item,spider) 65 | 66 | return item 67 | 68 | def updateOrInsertItem(self,item,spider): 69 | if item['code'] is None or item['code'] =='' : 70 | raise DropItem("Missing Content ") 71 | 72 | 73 | sqlString = ''' SELECT * FROM javBusTable where code = '%s' ''' % item['code'] 74 | self.cursor.execute(sqlString) 75 | res = self.cursor.fetchall() 76 | 77 | title = item['title'] 78 | cover = item['cover'] 79 | code = item['code'] 80 | date = item['date'] 81 | duration = item['duration'] 82 | series = item['series'] 83 | type = item['type'] 84 | actress = 
item['actress']
85 |         magnet = item['magnet']
86 |         size = item['size']
87 |         samplePic = item['samplePic']
88 |         link = item['link']
89 | 
90 |         if len(res) == 0:
91 |             # No existing row for this code: insert a new record
92 |             print('insert')
93 |             sqlInsertString = "INSERT INTO javBusTable (title,cover,code,date,duration,series,type,actress,magnet,size,samplePic,link,LastIndexFlag) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (title, cover, code, date, duration, series, type, actress, magnet, size, samplePic, link, "")
94 |             try:
95 |                 self.cursor.execute(sqlInsertString.encode('utf8'))
96 |             except Exception as e:
97 |                 print(e)
98 |                 self.dbConn.rollback()
99 | 
100 | 
101 |         else:
102 |             # A row with this code already exists: update it
103 |             print('update')
104 |             sqlUpdateString = '''update javBusTable set title = '%s' , cover = '%s' , date = '%s' , duration = '%s' , series = '%s' , type = '%s' , actress = '%s' , magnet = '%s' , size = '%s' , samplePic = '%s' , link = '%s' WHERE code = '%s' ''' % (title, cover, date, duration, series, type, actress, magnet, size, samplePic, link, code)
105 |             try:
106 |                 self.cursor.execute(sqlUpdateString.encode('utf8'))
107 |             except Exception as e:
108 |                 print(e)
109 |                 self.dbConn.rollback()
110 | 
111 |         self.dbConn.commit()
112 |         # self.downloadImagesWithItem(item = item)
113 | 
114 | 
115 |     def _handle_error(self, failure, item, spider):
116 |         print(failure)
117 | 
118 | 
119 | 
--------------------------------------------------------------------------------
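The INSERT and UPDATE in `pipelines.py` above splice the scraped values straight into the SQL text, so a title containing a single quote will break the statement. For reference, a minimal sketch of the same insert using pymysql's parameter binding (a hypothetical standalone helper, not part of the project; table and column names as above):

```python
import pymysql


def insert_item(conn, item):
    """Sketch: insert one scraped item with parameter binding instead of string formatting."""
    insert_sql = (
        "INSERT INTO javBusTable "
        "(title, cover, code, date, duration, series, type, actress, "
        "magnet, size, samplePic, link, LastIndexFlag) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    values = (item['title'], item['cover'], item['code'], item['date'],
              item['duration'], item['series'], item['type'], item['actress'],
              item['magnet'], item['size'], item['samplePic'], item['link'], "")
    with conn.cursor() as cursor:
        cursor.execute(insert_sql, values)  # pymysql escapes each value itself
    conn.commit()
```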
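As an addendum to the README's deployment section: the `schedule.json` and `cancel.json` calls shown there with curl can also be driven from Python using `requests` (already listed in `require.txt`). The project and spider names below are the ones defined in this repo; a local scrapyd instance is assumed:

```python
import requests

SCRAPYD = 'http://localhost:6800'

# Start a crawl; scrapyd returns the job id in its JSON response.
resp = requests.post(f'{SCRAPYD}/schedule.json',
                     data={'project': 'javbus', 'spider': 'JavbusSpider'})
job_id = resp.json().get('jobid')
print('scheduled job:', job_id)

# Cancel the job again if needed.
requests.post(f'{SCRAPYD}/cancel.json',
              data={'project': 'javbus', 'job': job_id})
```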