├── javbus ├── __init__.py ├── Utils │ ├── __init__.py │ ├── syLogger.py │ ├── syFileOperator.py │ ├── JAVBusMySQLDBManager.py │ └── JAVBusImageDownloadQueue.py ├── middlewares │ └── __init__.py ├── spiders │ ├── __init__.py │ └── JavbusSpider.py ├── items.py ├── middlewares.py ├── settings.py └── pipelines.py ├── require.txt ├── scrapy.cfg ├── .gitignore ├── README.md └── JAVBusImageDownloader.py /javbus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /javbus/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /javbus/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /require.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pymysql 3 | scrapy 4 | scrapy-splash 5 | scrapyd 6 | scrapyd-client -------------------------------------------------------------------------------- /javbus/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = javbus.settings 8 | 9 | [deploy] 10 | url = http://localhost:6800/ 11 | project = javbus 12 | -------------------------------------------------------------------------------- /javbus/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JavbusItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | cover = scrapy.Field() 16 | code = scrapy.Field() 17 | date = scrapy.Field() 18 | duration = scrapy.Field() 19 | series = scrapy.Field() 20 | type = scrapy.Field() 21 | actress = scrapy.Field() 22 | magnet = scrapy.Field() 23 | size = scrapy.Field() 24 | samplePic = scrapy.Field() 25 | link = scrapy.Field() 26 | 27 | 28 | class JavBusImageItem(scrapy.Item): 29 | image_urls = scrapy.Field() 30 | images = scrapy.Field() 31 | image_paths = scrapy.Field() -------------------------------------------------------------------------------- /javbus/Utils/syLogger.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import datetime 4 | 5 | 6 | #单例模式 7 | def singleton(cls, *args, **kw): 8 | instances = {} 9 | def _singleton(): 10 | if cls not in instances: 11 | instances[cls] = cls(*args, **kw) 12 | return instances[cls] 13 | return _singleton 14 | 15 | 16 | #log记录类 17 | @singleton 18 | class syLoggerManager(): 19 | 20 | 
isDebugMode = True 21 | 22 | def __init__(self): 23 | 24 | print ('Logger Init -----> if console shows many times , there is a bug check!') 25 | 26 | # self.logFilePath = os.getcwd() 27 | self.logFilePath = os.path.dirname(os.path.realpath(__file__)) 28 | self.logFilePath = os.path.join(self.logFilePath , 'log.txt') 29 | if os.path.exists(self.logFilePath): 30 | #存在log文件 31 | #py3.0 32 | # f = open(self.logFilePath,'a',encoding='utf8') 33 | #py2.0 34 | f = open(self.logFilePath, 'a') 35 | f.write('\n\n今天日志' + '==============' +str(datetime.date.today()) + '==============') 36 | else: 37 | f = open(self.logFilePath,'w') 38 | f.write('今天日志' + '==============' +str(datetime.date.today()) + '==============') 39 | f.close() 40 | 41 | 42 | def syLog(self,info): 43 | if info is None: 44 | return 45 | if self.isDebugMode == True: 46 | f = open(self.logFilePath,'a') 47 | f.write('\n %s =====> %s'%(str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) , info)) 48 | f.close() 49 | 50 | def syLogManyLines(self,infoArray): 51 | if infoArray is None: 52 | return 53 | if self.isDebugMode == True: 54 | f = open(self.logFilePath, 'a') 55 | for info in infoArray: 56 | 57 | f.write('\n %s' % (info)) 58 | f.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .idea 7 | 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | # ---> Objective-C 64 | # Xcode 65 | # 66 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 67 | 68 | ## Build generated 69 | build/ 70 | DerivedData 71 | 72 | ## Various settings 73 | *.pbxuser 74 | !default.pbxuser 75 | *.mode1v3 76 | !default.mode1v3 77 | *.mode2v3 78 | !default.mode2v3 79 | *.perspectivev3 80 | !default.perspectivev3 81 | xcuserdata 82 | 83 | ## Other 84 | *.xccheckout 85 | *.moved-aside 86 | *.xcuserstate 87 | *.xcscmblueprint 88 | 89 | ## Obj-C/Swift specific 90 | *.hmap 91 | *.ipa 92 | 93 | # CocoaPods 94 | # 95 | # We recommend against adding the Pods directory to your .gitignore. However 96 | # you should judge for yourself, the pros and cons are mentioned at: 97 | # http://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 98 | # 99 | #Pods/ 100 | 101 | # Carthage 102 | # 103 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 
104 | # Carthage/Checkouts
105 | 
106 | Carthage/Build
107 | 
108 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pyJAVBus

### Description:
A practice crawler project for `JAVBus`, for seasoned drivers only.


### Notes on crawling Ajax content:
On the Python side it relies on the `scrapy-splash` library.
On the Docker side it uses the `scrapinghub/splash` image.

### Warning:
**CentOS Only**
**Python3 Only**
1. The built-in "start Docker" and "clear swap" steps in the scripts are only intended for `CentOS`.
2. Create a `JavBusPython` database on your local MySQL server before running (see the sketch just below).
3. The database must use the `UTF8` character set (it can be created through `phpMyAdmin`); with the wrong charset, Japanese text cannot be inserted.
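For reference, a minimal sketch of creating that database with `pymysql` (a hypothetical one-off helper, not part of the project; the credentials mirror the defaults in `pipelines.py`, adjust them to your server):

```python
import pymysql

# One-off helper: create the JavBusPython database with a UTF-8 charset.
# Host, user and password below are the defaults used in pipelines.py.
conn = pymysql.connect(host='localhost', user='root',
                       password='22f25f9d81f4d21d', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(
            "CREATE DATABASE IF NOT EXISTS JavBusPython "
            "CHARACTER SET utf8 COLLATE utf8_general_ci"
        )
finally:
    conn.close()
```

The same statement can be run just as well from the MySQL console or phpMyAdmin.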
### USAGE:
1. Install the Python dependencies first

```python
pip install -r require.txt
```
2. Pull the Docker image

```c
docker pull scrapinghub/splash
```

~~3. Run Docker~~
(The spider already starts Docker and clears swap memory by itself.)

```c
docker run -p 8050:8050 scrapinghub/splash
```

4. Run the spider

```python
python -m scrapy crawl JavbusSpider
```

### PS: Things you need to change:
The database credentials in `pipelines.py`
The crawl depth (page range) in `JavbusSpider.py`; by default only the first and second pages are crawled

### Deployment (optional; the scripts can also be started by hand):

1. Deployment uses `scrapyd` and `scrapyd-client`, both already listed in `require.txt`.

2. Start the `Scrapyd` service

```c
scrapyd
```

or start it in the background

```c
nohup scrapyd > /dev/null 2>&1 &
```

Once started, it can be reached at `http://localhost:6800`.

`If it is deployed on a remote server and you need to reach it from outside, edit the config shipped with the package.`

For example, with a `pip` install the file lives at `xxx(python library path)/site-packages/scrapyd/default_scrapyd.conf`.

Set `bind_address = 0.0.0.0` and open the matching firewall rule; other machines can then reach the `GUI` console at `http://<server-ip>:6800`.

3. Deploy the spider

Enter the project and edit the `url` in `scrapy.cfg`.

Then run

```c
scrapyd-deploy [deploy_name]
```

to deploy, where `deploy_name` is one of the names listed by

```c
scrapyd-deploy -l
```

4. Run the spider

Start a spider through the API:

```c
curl http://localhost:6800/schedule.json -d project=PROJECT_NAME -d spider=SPIDER_NAME
```

Stop a running job:

```c
curl http://localhost:6800/cancel.json -d project=PROJECT_NAME -d job=JOB_ID
```

`PROJECT_NAME` is the project name from `scrapy.cfg`.

`SPIDER_NAME` is the spider's name as defined under the `spiders` folder.

(The same two calls can also be issued from Python with `requests`; a short sketch is appended at the end of this document.)

5. Watch the crawl at `http://<server-ip>:6800`.




#### Last But Not Least

The scripts only scrape the metadata and store it in the database; they do not download any images. If you also want the image files, run

`python JAVBusImageDownloader.py`

which downloads every image referenced in the database into a newly created `PYJAVBUS` folder one level above the project root.


--------------------------------------------------------------------------------
/javbus/Utils/syFileOperator.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | import shutil
4 | 
5 | # Blacklist of path fragments whose files should not be counted
6 | fileTypeBlackList = ['.git', '.idea', '.DS_Store']
7 | 
8 | 
9 | class syFileOperator():
10 |     def __init__(self):
11 |         # Directory that contains this file
12 |         self.currentPath = os.path.dirname(os.path.realpath(__file__))
13 | 
14 |     # Check whether a path exists
15 |     def isExistsFilePath(self, filePath):
16 |         return os.path.exists(filePath)
17 | 
18 |     # Remove a directory tree
19 |     def removeDirPath(self, dirPath):
20 |         if self.isExistsFilePath(dirPath):
21 |             shutil.rmtree(dirPath)
22 | 
23 |     # Remove a single file
24 |     def removeFilePath(self, filePath):
25 |         if self.isExistsFilePath(filePath):
26 |             os.remove(filePath)
27 | 
28 |     # Create a new directory
29 |     def createDirPath(self, dirPath):
30 |         if self.isExistsFilePath(dirPath):
31 |             print('Directory already exists')
32 |         else:
33 |             os.mkdir(dirPath)
34 | 
35 |     # Copy a file into a directory
36 |     def copyFileWithDir(self, oriPath, desPathDir):
37 |         if oriPath is None or desPathDir is None:
38 |             return
39 |         if not self.isExistsFilePath(oriPath):
40 |             return
41 |         if not self.isExistsFilePath(desPathDir):
42 |             self.createDirPath(desPathDir)
43 |         shutil.copy(oriPath, desPathDir)
44 | 
45 |     # Size of a single file (in MB)
46 |     def getFileSize(self, filePath):
47 |         if not self.isExistsFilePath(filePath):
48 |             return 0
49 |         size = os.path.getsize(filePath)
50 |         return size / 1024 / 1024
51 | 
52 |     # Total size of all files under a directory
53 |     # Returns a tuple (total size in MB, file count)
54 |     def getDirSize(self, dirPath):
55 |         filesize = 0
56 |         fileCount = 0
57 |         if not self.isExistsFilePath(dirPath):
58 |             return (0, 0)
59 |         # os.walk yields the parent directory, the directory names and the file names
60 |         for parent, dirnames, filenames in os.walk(dirPath):
61 |             for filename in filenames:
62 |                 currentPath = os.path.join(parent, filename)
63 |                 # Skip directories
64 |                 if os.path.isdir(currentPath):
65 |                     continue
66 |                 # Skip files whose path contains a blacklisted fragment
67 |                 if any(blackType in currentPath for blackType in fileTypeBlackList):
68 |                     continue
69 |                 filesize += os.path.getsize(currentPath) / 1024 / 1024
70 |                 fileCount += 1
71 |         return (filesize, fileCount)
72 | 
--------------------------------------------------------------------------------
/javbus/Utils/JAVBusMySQLDBManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import pymysql
4 | from javbus.Utils import syLogger
5 | import re
6 | 
7 | # Singleton decorator
8 | def singleton(cls, *args, **kw):
9 |     instances = {}
10 | 
11 |     def _singleton():
12 |         if cls not in instances:
13 |             instances[cls] = cls(*args, **kw)
14 |         return instances[cls]
15 | 
16 |     return _singleton
17 | 
18 | 
19 | # Database access class
20 | @singleton
21 | class dbManager():
22 |     javDBName = 'JavBusPython'
23 |     javTableName = 'javBusTable'
24 |     DBHost = "localhost"
25 |     DBPort = 3306
26 |     DBUser = 'root'
27 |     DBPassWord = '22f25f9d81f4d21d'
28 |     DBCharset = 'utf8'
29 | 
30 | 
31 |     def __init__(self):
32 |         self.logger = syLogger.syLoggerManager()
33 | 34 | 35 | def __executeStrings(self,sqlCommandsList,cursor): 36 | if len(sqlCommandsList) == 0 or cursor is None: 37 | return 38 | try: 39 | for sql in sqlCommandsList: 40 | cursor.execute(sql) 41 | except Exception as e: 42 | self.logger.syLog(str(e)) 43 | 44 | # 执行多sql语句 45 | def executeSQLArray(self, sqlStringArray): 46 | if len(sqlStringArray) == 0: 47 | print ('SQLString数据为空') 48 | return 49 | conn = pymysql.connect(self.DBHost, self.DBUser, self.DBPassWord, self.javDBName, 50 | charset=self.DBCharset) 51 | cursor = conn.cursor() 52 | try: 53 | for sql in sqlStringArray: 54 | cursor.execute(sql) 55 | print('sql执行成功') 56 | except Exception as e: 57 | self.logger.syLog(str(e)) 58 | finally: 59 | cursor.close() 60 | conn.commit() 61 | conn.close() 62 | 63 | 64 | # 执行命令 65 | def execute(self, executeString): 66 | fetchResult = [] 67 | conn = pymysql.connect(self.DBHost,self.DBUser,self.DBPassWord,self.javDBName,charset=self.DBCharset) 68 | cursor = conn.cursor() 69 | try: 70 | results = cursor.execute(executeString) 71 | fetchResult = cursor.fetchall(); 72 | print('sql执行成功') 73 | # print('executeString:%s'%executeString) 74 | except Exception as e: 75 | self.logger.syLog(str(e)) 76 | cursor.close() 77 | conn.commit() 78 | conn.close() 79 | # self.dbLock.release() 80 | return fetchResult 81 | 82 | 83 | 84 | 85 | # 执行命令(字典返回) 86 | def executeWithDictReturn(self, executeString): 87 | fetchResult = [] 88 | conn = pymysql.connect(self.DBHost,self.DBUser,self.DBPassWord,self.javDBName,charset=self.DBCharset) 89 | cursor = conn.cursor(cursor=pymysql.cursors.DictCursor) 90 | try: 91 | results = cursor.execute(executeString) 92 | fetchResult = cursor.fetchall(); 93 | print('sql字典执行成功') 94 | except Exception as e: 95 | self.logger.syLog(str(e)) 96 | 97 | cursor.close() 98 | conn.commit() 99 | conn.close() 100 | return fetchResult 101 | 102 | -------------------------------------------------------------------------------- /JAVBusImageDownloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | import os 5 | import queue 6 | import sys 7 | from javbus.Utils import syFileOperator 8 | from javbus.Utils import syLogger 9 | from javbus.Utils import JAVBusMySQLDBManager 10 | from javbus.Utils import JAVBusImageDownloadQueue 11 | 12 | 13 | 14 | class JAVImageDownloader(): 15 | 16 | 17 | def __init__(self): 18 | 19 | self.fileOperator = syFileOperator.syFileOperator() 20 | self.dbManager = JAVBusMySQLDBManager.dbManager() 21 | self.logger = syLogger.syLoggerManager() 22 | # 创建资源目录 23 | # PYJAVBUS ... 
(项目文件根)javbus (同级) /javbus / Utils / currentPath 24 | self.sourcePath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(self.fileOperator.currentPath))),'PYJAVBUS') 25 | if self.fileOperator.isExistsFilePath(self.sourcePath) == False: 26 | self.fileOperator.createDirPath(self.sourcePath) 27 | #Queue 28 | self.downloadImageQueue = queue.Queue(maxsize = 5) 29 | #Resource 30 | self.downloadSource = [] 31 | 32 | 33 | def startDownLoadImage(self): 34 | print('SourceDownloadPath:%s'%self.sourcePath) 35 | sqlString = '''SELECT * FROM %s'''%self.dbManager.javTableName 36 | items = self.dbManager.executeWithDictReturn(sqlString) 37 | if len(sqlString) == 0 or sqlString is None: 38 | self.logger.syLog('无下载资源') 39 | return 40 | 41 | for item in items: 42 | if len(item['samplePic']) == 0 or item['samplePic'] is None or len(item['cover']) == 0: 43 | self.logger.syLog('无样例图片') 44 | continue 45 | rootPath = item['code'] 46 | rootPath = os.path.join(self.sourcePath, rootPath) 47 | if self.fileOperator.isExistsFilePath(rootPath) == False: 48 | self.fileOperator.createDirPath(rootPath) 49 | # 样品图 50 | samplePic = item['samplePic'] 51 | #如果没有图片则不用添加 52 | if samplePic is not None and len(samplePic) > 0: 53 | samplePicArray = samplePic.split('||') 54 | for picURLAddress in samplePicArray: 55 | downInfoDict = { 56 | 57 | "url":str(picURLAddress), 58 | "code":str(item['code']), 59 | "type":"samplePic" 60 | 61 | } 62 | self.downloadSource.append(downInfoDict) 63 | 64 | if item['cover'] is not None : 65 | downInfoDict = { 66 | 67 | "url":str(item['cover']), 68 | "code":str(item['code']), 69 | "type":"cover" 70 | 71 | } 72 | self.downloadSource.append(downInfoDict) 73 | 74 | #多线程下载 75 | for i in range(10): 76 | queue = JAVBusImageDownloadQueue.javBusImageDownloadQueue(self.downloadImageQueue) 77 | queue.setDaemon(True) 78 | queue.start() 79 | for i in range(len(self.downloadSource)): 80 | self.downloadImageQueue.put(self.downloadSource[i]) 81 | 82 | 83 | self.downloadImageQueue.join() 84 | 85 | 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 91 | imageDownloader = JAVImageDownloader() 92 | imageDownloader.startDownLoadImage() 93 | print('结束清除SWAP') 94 | os.system('date ; sleep 10 ; swapoff -a && swapon -a ; date') 95 | print('清除SWAP成功') -------------------------------------------------------------------------------- /javbus/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JavbusSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JavbusDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /javbus/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for javbus project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | import os 12 | 13 | BOT_NAME = 'javbus' 14 | 15 | SPIDER_MODULES = ['javbus.spiders'] 16 | NEWSPIDER_MODULE = 'javbus.spiders' 17 | SPLASH_URL = 'http://localhost:8050' 18 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 19 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'javbus (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | CONCURRENT_ITEMS = 60 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | DOWNLOAD_DELAY = 0.25 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 53 | SPIDER_MIDDLEWARES = { 54 | # 'javbus.middlewares.JavbusSpiderMiddleware': 543, 55 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 56 | } 57 | 58 | 59 | # Enable or disable downloader middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 61 | DOWNLOADER_MIDDLEWARES = { 62 | # 'javbus.middlewares.JavbusDownloaderMiddleware': 543, 63 | # 'javbus.middlewares.JavBusMiddleware.JavbusMiddleware': 543, 64 | 65 | 'scrapy_splash.SplashCookiesMiddleware': 723, 66 | 'scrapy_splash.SplashMiddleware': 725, 67 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 68 | } 69 | 70 | 71 | # Enable or disable extensions 72 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 73 | #EXTENSIONS = { 74 | # 'scrapy.extensions.telnet.TelnetConsole': None, 75 | #} 76 | 77 | # Configure item pipelines 78 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 79 | ITEM_PIPELINES = { 80 | 'javbus.pipelines.JavbusPipeline': 300, 81 | # 'javbus.javImagesPipeline.JavImgDownloadPipeline': 301, 82 | } 83 | 84 | #设置图片下载路径 85 | IMAGES_STORE = os.path.join(os.path.dirname(os.getcwd()),'PYJAVBUS') 86 | # 过期天数 87 | IMAGES_EXPIRES = 10 #90天内抓取的都不会被重抓 88 | 89 | 90 | # Enable and configure the AutoThrottle extension (disabled by default) 91 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 92 | #AUTOTHROTTLE_ENABLED = True 93 | # The initial download delay 94 | #AUTOTHROTTLE_START_DELAY = 5 95 | # The maximum download delay to be set in case of high latencies 96 | #AUTOTHROTTLE_MAX_DELAY = 60 97 | # The average number of requests Scrapy should be sending in parallel to 98 | # each remote server 99 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 100 | # Enable showing 
throttling stats for every response received: 101 | #AUTOTHROTTLE_DEBUG = False 102 | 103 | # Enable and configure HTTP caching (disabled by default) 104 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 105 | #HTTPCACHE_ENABLED = True 106 | #HTTPCACHE_EXPIRATION_SECS = 0 107 | #HTTPCACHE_DIR = 'httpcache' 108 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 109 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 110 | -------------------------------------------------------------------------------- /javbus/Utils/JAVBusImageDownloadQueue.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import queue 3 | import threading 4 | import requests 5 | from javbus.Utils import syLogger 6 | from javbus.Utils import syFileOperator 7 | import os 8 | 9 | class javBusImageDownloadQueue(threading.Thread): 10 | def __init__(self,queue): 11 | threading.Thread.__init__(self) 12 | self.queue = queue 13 | self.downloadLock = threading.Lock() 14 | self.fileOperator = syFileOperator.syFileOperator() 15 | self.logger = syLogger.syLoggerManager() 16 | # 资源目录 17 | self.sourcePath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(self.fileOperator.currentPath))),'PYJAVBUS') 18 | self.logInfo = [] 19 | 20 | 21 | def run(self): 22 | while True: 23 | item = self.queue.get() 24 | rootPath = item['code'] 25 | rootPath = os.path.join(self.sourcePath, rootPath) 26 | if self.fileOperator.isExistsFilePath(rootPath) == False: 27 | self.fileOperator.createDirPath(rootPath) 28 | url = item['url'] 29 | 30 | #封面图 31 | if item['type'] == 'cover': 32 | coverURL = url 33 | coverSplit = coverURL.split('/') 34 | cover_name = str(coverSplit[len(coverSplit) - 1]) 35 | coverPath = os.path.join(rootPath, cover_name) 36 | if self.fileOperator.isExistsFilePath(coverPath) == True: 37 | info = '存在 %s 封面 %s ' % (str(item['code']), cover_name) 38 | print(info) 39 | self.logInfo.append(info) 40 | self.queue.empty() 41 | self.queue.task_done() 42 | else: 43 | try: 44 | ir = requests.get(coverURL, timeout=2) 45 | except Exception as e: 46 | info = '下载 %s 封面 %s 错误 Error : %s ' % (str(item['code']), cover_name, str(e)) 47 | print(info) 48 | self.logInfo.append(info) 49 | self.queue.task_done() 50 | if ir.status_code == 200: 51 | with open(coverPath, 'wb') as f: 52 | f.write(ir.content) 53 | f.close() 54 | info = '下载 %s 封面 %s 成功' % (str(item['code']), cover_name) 55 | self.logInfo.append(info) 56 | print('===>写入 %s 封面 %s 成功' % (str(item['code']), cover_name)) 57 | print('CoverPath:%s'%str(coverPath)) 58 | self.queue.task_done() 59 | 60 | 61 | 62 | #样例图片下载 63 | elif item['type'] == 'samplePic': 64 | list_name = url.split('/') 65 | # 图片名称 66 | file_name = str(list_name[len(list_name) - 1]) 67 | filePath = os.path.join(rootPath, file_name) 68 | # 存在图片则不用下载 69 | if self.fileOperator.isExistsFilePath(filePath) == True: 70 | info = '存在 %s 样品图 %s' % (str(item['code']), file_name) 71 | self.logInfo.append(info) 72 | print(info) 73 | self.queue.empty() 74 | self.queue.task_done() 75 | else: 76 | try: 77 | ir = requests.get(url, timeout=2) 78 | except Exception as e: 79 | info = '下载 %s 样品图 %s 错误 Error : %s ' % (str(item['code']), file_name, str(e)) 80 | print(info) 81 | self.logInfo.append(info) 82 | self.queue.task_done() 83 | if ir.status_code == 200: 84 | with open(filePath, 'wb') as f: 85 | f.write(ir.content) 86 | f.close() 87 | info = '下载 %s 样品图 %s 成功' % (str(item['code']), file_name) 88 | 
self.logInfo.append(info) 89 | print('下载 %s 样品图 %s 成功' % (str(item['code']), file_name)) 90 | print('===>写入 %s 样品图 %s 成功' % (str(item['code']), file_name)) 91 | print('CoverPath:%s' % str(filePath)) 92 | self.queue.task_done() 93 | 94 | 95 | def _stop(self): 96 | # self.logger.syLogManyLines(self.logInfo) 97 | print('End Queue') 98 | -------------------------------------------------------------------------------- /javbus/spiders/JavbusSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import os 4 | import time 5 | from scrapy.http import Request 6 | from javbus.items import JavbusItem 7 | from scrapy_splash import SplashRequest 8 | 9 | 10 | class JavbusspiderSpider(scrapy.Spider): 11 | #关闭Docker 12 | print('关闭DOCKER!') 13 | os.system('docker kill splash') 14 | os.system('docker rm splash') 15 | print('关闭DOCKER命令完!') 16 | 17 | time.sleep(1) 18 | 19 | #启动Docker 20 | print('启动DOCKER!') 21 | os.system('nohup docker run --name splash -p 8050:8050 scrapinghub/splash > /dev/null 2>&1 &') 22 | print('启动DOCKER命令完!') 23 | 24 | time.sleep(3) 25 | 26 | print('启动爬虫') 27 | 28 | name = 'JavbusSpider' 29 | allowed_domains = ['www.javbus.cc'] 30 | start_urls = ['https://www.javbus.cc/page/1'] 31 | 32 | header = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} 33 | 34 | 35 | def start_requests(self): 36 | return [Request(self.start_urls[0],callback=self.parse,headers=self.header)] 37 | 38 | def parse(self, response): 39 | 40 | javList = response.xpath('//a[@class = "movie-box"]/@href').extract() 41 | for javItem in javList: 42 | # request = Request(javItem,callback=self.parse_inner) 43 | request = SplashRequest(javItem, callback=self.parse_inner,args={'wait': 10}) 44 | yield request 45 | for i in range(1,2,1): 46 | page_url = 'https://www.javbus.cc/page/{}'.format(i) 47 | yield Request(page_url,callback=self.parse,meta={'download_timeout' : 10}) 48 | 49 | 50 | def parse_inner(self,response): 51 | item = JavbusItem() 52 | #数组转字符 53 | #标题 54 | item['title'] = response.xpath('//div[@class = "container"]/h3/text()').extract() 55 | item['title'] = item['title'][0].replace('\t', "").replace('\n', "").replace(' ', "") 56 | #封面 57 | item['cover'] = response.xpath('//a[@class = "bigImage"]/@href').extract() 58 | item['cover'] = item['cover'][0] 59 | #番号 60 | item['code'] = response.xpath('//div[@class = "col-md-3 info"]/ p[1] / span[2]/text()').extract() 61 | item['code'] = item['code'][0] 62 | #发布日期 63 | item['date'] = response.xpath('//div[@class = "col-md-3 info"]/ p[2] / text()').extract() 64 | item['date'] = item['date'][0].replace('\t', "").replace('\n', "").replace(' ', "") 65 | #时长 66 | item['duration'] = response.xpath('//div[@class = "col-md-3 info"]/ p[3] /text()').extract() 67 | item['duration'] = item['duration'][0].replace('\t', "").replace('\n', "").replace(' ', "") 68 | #系列 69 | item['series'] = response.xpath('//span[@class = "genre"]/a/text()').extract() 70 | item['series'] = "||".join(item['series']) 71 | #类型 72 | item['type'] = response.xpath('//span[@class = "genre"]/a/text()').extract() 73 | item['type'] = "||".join(item['type']) 74 | #演员 75 | item['actress'] = response.xpath('//span[@class = "star-toggle"]/text()').extract() 76 | if item['actress'] is None or len(item['actress']) == 0: 77 | item['actress'] = '' 78 | else: 79 | item['actress'] = item['actress'][0] 80 | #封面图 81 | item['samplePic'] = 
response.xpath('//a[@class = "sample-box"]/@href').extract() 82 | item['samplePic'] = "||".join(item['samplePic']) 83 | #连接 84 | item['link'] = response.url 85 | #去重排序 86 | item['magnet'] = response.xpath('//table[@id = "magnet-table"]//tr//td/a/@href').extract() 87 | newMagnet = [] 88 | for id in item['magnet']: 89 | if id not in newMagnet: 90 | newMagnet.append(id) 91 | item['magnet'] = "||".join(newMagnet) 92 | 93 | #去空格 94 | item['size'] = response.xpath('//table[@id = "magnet-table"]//tr/td[2]/a/text()').extract() 95 | item['size'] = ("||".join(item['size'])).strip() 96 | item['size'] = item['size'].replace('\t',"").replace('\n',"").replace(' ',"") 97 | yield item 98 | 99 | @staticmethod 100 | def close(spider, reason): 101 | #关闭Spider 102 | print('关闭Spider') 103 | print('关闭DOCKER!') 104 | os.system('docker kill splash') 105 | os.system('docker rm splash') 106 | print('关闭DOCKER命令完!') 107 | os.system('date ; sleep 10 ; swapoff -a && swapon -a ; date') 108 | closed = getattr(spider, 'closed', None) 109 | if callable(closed): 110 | return closed(reason) 111 | 112 | -------------------------------------------------------------------------------- /javbus/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import pymysql 9 | import requests 10 | from scrapy.exceptions import DropItem 11 | 12 | 13 | class JavbusPipeline(object): 14 | 15 | def __init__(self): 16 | pass 17 | 18 | 19 | @classmethod 20 | def from_settings(cls, settings): 21 | print("Spider Singleton!") 22 | return cls() # 相当于conn付给了这个类,self中可以得到 23 | 24 | def open_spider(self, spider): 25 | print('open pipelines') 26 | javDBName = 'JavBusPython' 27 | DBHost = "localhost" 28 | DBPort = 3306 29 | DBUser = 'root' 30 | DBPassWord = '22f25f9d81f4d21d' 31 | DBCharset = 'utf8' 32 | self.dbConn = pymysql.connect(DBHost, DBUser, DBPassWord, javDBName, charset=DBCharset) 33 | # conn = pymysql.connect(host = DBHost, user = DBUser, passwd = DBPassWord , port = DBPort ,charset=DBCharset) 34 | self.cursor = self.dbConn.cursor() 35 | # cursor.execute('''CREATE DATABASE IF NOT EXISTS JavBusPython''') 36 | self.cursor.execute('''CREATE TABLE IF NOT EXISTS javBusTable( 37 | id INTEGER PRIMARY KEY auto_increment, 38 | title TEXT, 39 | cover TEXT, 40 | code TEXT, 41 | date TEXT, 42 | duration TEXT, 43 | series TEXT, 44 | type TEXT, 45 | actress TEXT, 46 | magnet TEXT, 47 | size TEXT, 48 | samplePic TEXT, 49 | link TEXT, 50 | LastIndexFlag TEXT 51 | )''') 52 | self.cursor.execute("set names 'utf8'") 53 | 54 | 55 | def close_spider(self, spider): 56 | self.dbConn.commit() 57 | self.cursor.close() 58 | self.dbConn.close() 59 | print("Spider Done!") 60 | 61 | 62 | def process_item(self, item, spider): 63 | 64 | self.updateOrInsertItem(item,spider) 65 | 66 | return item 67 | 68 | def updateOrInsertItem(self,item,spider): 69 | if item['code'] is None or item['code'] =='' : 70 | raise DropItem("Missing Content ") 71 | 72 | 73 | sqlString = ''' SELECT * FROM javBusTable where code = '%s' ''' % item['code'] 74 | self.cursor.execute(sqlString) 75 | res = self.cursor.fetchall() 76 | 77 | title = item['title'] 78 | cover = item['cover'] 79 | code = item['code'] 80 | date = item['date'] 81 | duration = item['duration'] 82 | series = item['series'] 83 | type = item['type'] 84 | actress = 
item['actress']
85 |         magnet = item['magnet']
86 |         size = item['size']
87 |         samplePic = item['samplePic']
88 |         link = item['link']
89 | 
90 |         if len(res) == 0:
91 |             # No existing row for this code: insert a new record
92 |             print('insert')
93 |             sqlInsertString = "INSERT INTO javBusTable (title,cover,code,date,duration,series,type,actress,magnet,size,samplePic,link,LastIndexFlag) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (title, cover, code, date, duration, series, type, actress, magnet, size, samplePic, link, "")
94 |             try:
95 |                 self.cursor.execute(sqlInsertString.encode('utf8'))
96 |             except Exception as e:
97 |                 print(e)
98 |                 self.dbConn.rollback()
99 | 
100 | 
101 |         else:
102 |             # A row with this code already exists: update it
103 |             print('update')
104 |             sqlUpdateString = '''update javBusTable set title = '%s' , cover = '%s' , date = '%s' , duration = '%s' , series = '%s' , type = '%s' , actress = '%s' , magnet = '%s' , size = '%s' , samplePic = '%s' , link = '%s' WHERE code = '%s' ''' % (title, cover, date, duration, series, type, actress, magnet, size, samplePic, link, code)
105 |             try:
106 |                 self.cursor.execute(sqlUpdateString.encode('utf8'))
107 |             except Exception as e:
108 |                 print(e)
109 |                 self.dbConn.rollback()
110 | 
111 |         self.dbConn.commit()
112 |         # self.downloadImagesWithItem(item = item)
113 | 
114 | 
115 |     def _handle_error(self, failure, item, spider):
116 |         print(failure)
117 | 
118 | 
119 | 
--------------------------------------------------------------------------------
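The INSERT and UPDATE in `pipelines.py` above splice the scraped values straight into the SQL text, so a title containing a single quote will break the statement. For reference, a minimal sketch of the same insert using pymysql's parameter binding (a hypothetical standalone helper, not part of the project; table and column names as above):

```python
import pymysql


def insert_item(conn, item):
    """Sketch: insert one scraped item with parameter binding instead of string formatting."""
    insert_sql = (
        "INSERT INTO javBusTable "
        "(title, cover, code, date, duration, series, type, actress, "
        "magnet, size, samplePic, link, LastIndexFlag) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    values = (item['title'], item['cover'], item['code'], item['date'],
              item['duration'], item['series'], item['type'], item['actress'],
              item['magnet'], item['size'], item['samplePic'], item['link'], "")
    with conn.cursor() as cursor:
        cursor.execute(insert_sql, values)  # pymysql escapes each value itself
    conn.commit()
```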
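As an addendum to the README's deployment section: the `schedule.json` and `cancel.json` calls shown there with curl can also be driven from Python using `requests` (already listed in `require.txt`). The project and spider names below are the ones defined in this repo; a local scrapyd instance is assumed:

```python
import requests

SCRAPYD = 'http://localhost:6800'

# Start a crawl; scrapyd returns the job id in its JSON response.
resp = requests.post(f'{SCRAPYD}/schedule.json',
                     data={'project': 'javbus', 'spider': 'JavbusSpider'})
job_id = resp.json().get('jobid')
print('scheduled job:', job_id)

# Cancel the job again if needed.
requests.post(f'{SCRAPYD}/cancel.json',
              data={'project': 'javbus', 'job': job_id})
```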