├── README.md
├── api.py
├── comic_info.sql
├── spider.py
└── spiderInDB.py

/README.md:
--------------------------------------------------------------------------------
# ACSpider
A crawler API for anime and comics. Data sources are Manhuadui (漫画堆) and Yinghua Dongman (樱花动漫); built with Python 3.5 + requests + selenium + PhantomJS + Flask.

This is the crawler API I wrote for an anime/comic search site. (Now abandoned.)

7.29 Changed the way animation video addresses are fetched

7.28 Added the animation crawler; data comes from Yinghua Dongman (樱花动漫)


## What each file does

### spider.py
The crawler class. Fetches the required data from search pages, detail pages and chapter pages.

### api.py
The API layer. Exposes the crawler class as an HTTP API.

### spiderInDB.py
The storage script. Saves the usable data for every comic on Manhuadui into a MySQL database (everything except the images of each chapter, because including them would make the crawl take far too long).

### comic_info.sql
The table structure used by the storage script.

## Known issues
There are probably plenty of small problems; only the ones I consider significant are listed here.
### PhantomJS memory usage
PhantomJS uses more and more memory during continuous crawling, until the server falls over. Simulating closing and reopening tabs or windows, clearing cookies, and switching to headless Chrome did not help much, so in the end I settled on a crude workaround: restart PhantomJS after a certain number of pages. One problem remains: the restart blocks the data being returned. I think the restart code could be moved into a new thread, but I am not sure how to do that.
### selenium retries
(Resolved 7.21)
I do not know whether selenium has a built-in way to set a retry count, so I wrote a while loop in api.py to retry, but it did not seem to work; if selenium's wait time is set too short it simply throws an error.
--------------------------------------------------------------------------------
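The restart described under "PhantomJS memory usage" could plausibly be moved off the request path by building the replacement browser in a background thread and swapping it in once ready. A minimal sketch, not code from this repo: restart_browser_async and _restart_lock are names introduced here, and callers would still need to avoid touching the old instance mid-swap.

# Sketch only: one possible way to restart PhantomJS without blocking the response,
# reusing Spider, service_args and dcap from spider.py.
import threading
from selenium import webdriver
from spider import Spider, service_args, dcap

_restart_lock = threading.Lock()

def restart_browser_async(sp: Spider):
    """Build a fresh PhantomJS in a background thread, then swap it in."""
    def _restart():
        new = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
        new.set_page_load_timeout(3)
        with _restart_lock:
            old, sp.browser, sp.count = sp.browser, new, 0
        old.quit()  # quit the old instance only after the swap
    threading.Thread(target=_restart, daemon=True).start()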
/api.py:
--------------------------------------------------------------------------------
# This module exposes the crawler as an HTTP API

from flask import Flask, request
import flask_restful
from flask_restful import Resource
from spider import Spider
import logging

# Raise the log level to filter out access logs and keep memory usage down
log = logging.getLogger('werkzeug')
log.setLevel(logging.WARNING)

app = Flask("Spider")
api = flask_restful.Api(app)

sp = Spider()

# Comic search page
class ComicSearch(Resource):
    def get(self):
        kw = request.args.get('kw')
        p = request.args.get('p')
        try: res = sp.comic_search(kw, p)
        except: res = ''
        finally: return res

# Comic detail page
class ComicItem(Resource):
    def get(self):
        name = request.args.get('slug')
        try: res = sp.comic_item(name)
        except: res = ''
        finally: return res

# Comic chapter page
class ComicImg(Resource):
    def get(self):
        url = request.args.get('ch')
        p = request.args.get('p')
        try: res = sp.comic_img(url, p)
        except: res = ''
        finally: return res

# Animation schedule
class AnimateTable(Resource):
    def get(self):
        try: res = sp.animate_table()
        except: res = ''
        finally: return res

# Animation search page
class AnimateSearch(Resource):
    def get(self):
        kw = request.args.get('kw')
        try: res = sp.animate_search(kw)
        except: res = ''
        finally: return res

# Animation detail page
class AnimateItem(Resource):
    def get(self):
        url = request.args.get('url')
        try: res = sp.animate_item(url)
        except: res = ''
        finally: return res

# Animation episode page
class AnimateVideo(Resource):
    def get(self):
        url = request.args.get('url')
        try: res = sp.animate_video(url)
        except: res = ''
        finally: return res

# Deprecated
# class Video(Resource):
#     def get(self):
#         src = request.args.get('src')
#         return sp.video(src)

api.add_resource(ComicSearch, '/spider/comicsearch')
api.add_resource(ComicItem, '/spider/comicitem')
api.add_resource(ComicImg, '/spider/comicimg')
api.add_resource(AnimateTable, '/spider/animatetable')
api.add_resource(AnimateSearch, '/spider/animatesearch')
api.add_resource(AnimateItem, '/spider/animateitem')
api.add_resource(AnimateVideo, '/spider/animatevideo')
# api.add_resource(Video, '/spider/video')

if __name__ == '__main__':
    # Host is localhost, port 5000; use_reloader=False keeps the code from running twice
    # Example request: localhost:5000/spider/comicsearch?kw=进&p=1
    app.run(host='localhost', port=5000, debug=True, use_reloader=False)
--------------------------------------------------------------------------------
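With api.py running locally, the routes registered above can be exercised directly. A quick sketch using requests: the first query mirrors the example in the comment at the bottom of api.py, and 'haizeiwang' is the sample slug used in spider.py's own test block.

# Quick manual test of the API while api.py is running on localhost:5000.
import requests

BASE = 'http://localhost:5000'

# comic search: kw = keyword, p = page (20 results per page)
print(requests.get(BASE + '/spider/comicsearch', params={'kw': '进', 'p': '1'}).json())

# comic detail page: slug is the comic's romanized name
print(requests.get(BASE + '/spider/comicitem', params={'slug': 'haizeiwang'}).json())

# animation schedule
print(requests.get(BASE + '/spider/animatetable').json())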
/comic_info.sql:
--------------------------------------------------------------------------------
/*
Navicat MySQL Data Transfer

Source Server         : localhost_3306
Source Server Version : 50724
Source Host           : localhost:3306
Source Database       : coldrain

Target Server Type    : MYSQL
Target Server Version : 50724
File Encoding         : 65001

Date: 2019-07-20 10:48:20
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for comic_info
-- ----------------------------
DROP TABLE IF EXISTS `comic_info`;
CREATE TABLE `comic_info` (
  `cid` int(11) NOT NULL,
  `cname` varchar(255) DEFAULT NULL,
  `cslug` varchar(100) DEFAULT NULL,
  `ccover` varchar(255) DEFAULT NULL,
  `clastname` varchar(255) DEFAULT NULL,
  `cauthor` varchar(255) DEFAULT NULL,
  `cserialise` tinyint(1) DEFAULT NULL,
  `ctype` varchar(255) DEFAULT NULL,
  `ccategory` varchar(25) DEFAULT NULL,
  `carea` varchar(15) DEFAULT NULL,
  `cupdate` varchar(255) DEFAULT NULL,
  `cchapters` text,
  `cchapterurl` text,
  PRIMARY KEY (`cid`),
  KEY `cslug` (`cslug`),
  KEY `cid` (`cid`,`clastname`),
  KEY `cname` (`cname`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
--------------------------------------------------------------------------------
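spiderInDB.py fills this table by interpolating values straight into its SQL strings with %. For comparison, a minimal sketch of the same insert using pymysql parameter binding; the connection settings mirror spiderInDB.py, and the row values are placeholders made up for the example.

# Sketch: inserting one row into comic_info with parameter binding instead of
# building the SQL by string formatting (the values below are placeholders).
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', db='coldrain',
                       port=3306, charset='utf8')
with conn.cursor() as cur:
    cur.execute(
        "insert into comic_info (cid, cname, cslug, ccover, clastname, cauthor, cserialise) "
        "values (%s, %s, %s, %s, %s, %s, %s)",
        (1, 'Example Comic', 'haizeiwang', 'http://example.com/cover.jpg',
         'Chapter 1', 'Example Author', 1))
conn.commit()
conn.close()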
/spider.py:
--------------------------------------------------------------------------------
# coding = utf-8
# This module scrapes the mobile site of Manhuadui (漫画堆)

from lxml import etree
import requests
from requests.adapters import HTTPAdapter
import json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

service_args = []
service_args.append('--load-images=false')       # disable image loading
service_args.append('--ignore-ssl-errors=true')  # ignore HTTPS errors
service_args.append('--disk-cache=true')         # enable the disk cache
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
)

class Spider:
    def __init__(self):
        # page counter; PhantomJS is restarted once it passes a threshold
        self.count = 0
        self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
        # page-load timeout
        self.browser.set_page_load_timeout(3)
        self.s = requests.Session()
        self.s.headers.update({'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'})
        # retry count for requests
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))

    # Crawl search results
    # kw: keyword
    # p: page number, 20 results per page
    def comic_search(self, kw, p):
        URL = 'https://450.manhuadang.net/comic/search'
        if kw is not None:
            URL += '?keywords=' + kw
            if p is not None:
                URL += '&page=' + p
        elif p is not None:
            URL += '?page=' + p
        # timeout=(connect timeout, read timeout)
        r = self.s.get(URL, timeout=(3, 4))
        data = json.loads(r.text)
        r.close()
        # return the full JSON result
        return data

    # Crawl a comic detail page
    # name: the comic's slug (romanized name), see the actual page
    def comic_item(self, name):
        URL = 'https://m.manhuadui.com/manhua/'
        r = self.s.get(URL + name + "/", timeout=(3, 4))
        h = etree.HTML(r.text)
        # comic title
        title = "//div[@class='subHeader']/h1[@id='comicName']/text()"
        # cover URL
        cover = "//div[@id='Cover']/img/@src"
        # author
        author = "//div[@class='sub_r autoHeight']/p[1]/text()"
        # type
        type = "//div[@class='sub_r autoHeight']/p[2]/a/text()"
        # category
        category = "//div[@class='sub_r autoHeight']/p[3]/a[1]/text()"
        # region
        area = "//div[@class='sub_r autoHeight']/p[3]/a[2]/text()"
        # status
        status = "//div[@class='sub_r autoHeight']/p[3]/a[3]/text()"
        # last update date
        update = "//div[@class='sub_r autoHeight']/p[5]/span[2]/text()"
        # chapter names
        chapterName = "//div[@class='chapter-warp']/ul/li/a/span[1]/text()"
        # chapter URLs
        chapterURL = "//div[@class='chapter-warp']/ul/li/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "author": h.xpath(author),
            "category": h.xpath(category),
            "type": h.xpath(type),
            "area": h.xpath(area),
            "status": h.xpath(status),
            "update": h.xpath(update),
            "chapterName": h.xpath(chapterName)[::-1],
            "chapterURL": h.xpath(chapterURL)[::-1]
        }

    # Crawl the actual content of a comic chapter
    # URL: the chapter's address on the mobile site
    # p: page number
    def comic_img(self, URL, p):
        if p is not None:
            URL += '?p=' + p
        i = 0
        # retry up to six times; note that if every attempt fails, h and
        # thisChapter stay unset and the code below raises
        while i < 6:
            try:
                # count this fetch
                self.count = self.count + 1
                self.browser.get(URL)
                h = etree.HTML(self.browser.page_source)
                # XPath for the current chapter name
                this = "//div[@class='subHeader']/a[@class='BarTit']/text()"
                # current chapter name
                thisChapter = h.xpath(this)[0].replace('\n', '').strip()
                i = 6
            except Exception:
                i += 1
        # data for the previous chapter
        prev = self.browser.execute_script('return prevChapterData')
        # data for the next chapter
        next = self.browser.execute_script('return nextChapterData')
        cover = self.browser.execute_script('return pageImage')
        # the comic's slug
        slug = "//a[@class='iconRet']/@href"
        # comic title + chapter name
        title = "//head/meta[@name='keywords']/@content"
        # URLs of the comic images on the current page
        img = "//div[@id='images']/img/@src"
        # current page number and total pages
        page = "//div[@id='images']/p/text()"
        # comic title
        titleName = h.xpath(title)[0].replace(thisChapter, '')
        # PhantomJS keeps using more memory the longer it crawls, hence this crude workaround.
        # Ideally the restart would run in a separate thread; doing it inline adds a lot of
        # waiting, but I am not sure how to do that.
        # Once the counter passes 20, restart PhantomJS.
        # self.browser.delete_all_cookies()
        if self.count > 20:
            self.browser.quit()
            self.count = 0
            self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
            self.browser.set_page_load_timeout(3)
        return {
            'prev': prev,
            'next': next,
            'cover': cover,
            "title": titleName,
            "this": thisChapter,
            "img": h.xpath(img),
            "page": h.xpath(page),
            "slug": h.xpath(slug)[0].replace('https://m.manhuadui.com/manhua/', '')[:-1]
        }

    # Animation schedule
    def animate_table(self):
        URL = 'http://m.yhdm.tv'
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        titles = []
        urls = []
        news = []
        newUrls = []
        for index in range(7):
            title = "//div[@class='tlist']/ul[%d]/li/a/text()"
            url = "//div[@class='tlist']/ul[%d]/li/a/@href"
            new = "//div[@class='tlist']/ul[%d]/li/span/a/text()"
            newUrl = "//div[@class='tlist']/ul[%d]/li/span/a/@href"
            titles.append(h.xpath(title % (index + 1)))
            urls.append(h.xpath(url % (index + 1)))
            news.append(h.xpath(new % (index + 1)))
            newUrls.append(h.xpath(newUrl % (index + 1)))
        r.close()
        return {
            "title": titles,
            "url": urls,
            "new": news,
            "newUrl": newUrls
        }

    # Animation search
    def animate_search(self, kw):
        URL = 'http://m.yhdm.tv/search/'
        if kw is not None:
            URL += kw
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text)
        # title
        title = "//a[@class='itemtext']/text()"
        # cover URL
        cover = "//div[@class='imgblock']/@style"
        # latest episode
        new = "//div[@class='itemimgtext']/text()"
        # URL
        url = "//a[@class='itemtext']/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "new": h.xpath(new),
            "url": h.xpath(url)
        }

    # Animation detail page
    def animate_item(self, url):
        URL = 'http://m.yhdm.tv'
        if url is not None:
            URL += url
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        # title
        title = "//div[@class='show']/h1/text()"
        # cover URL
        cover = "//div[@class='show']/img/@src"
        # latest episode
        new = "//div[@class='show']/p[2]/text()"
        # air date
        time = "//div[@class='show']/p[3]/text()"
        # type
        type = "//div[@class='show']/p[4]/a/text()"
        # description
        info = "//div[@class='info']/text()"
        # episode names
        chapterName = "//div[@id='playlists']/ul/li/a/text()"
        # episode URLs
        chapterURL = "//div[@id='playlists']/ul/li/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "new": h.xpath(new),
            "time": h.xpath(time),
            "type": h.xpath(type),
            "info": h.xpath(info),
            "chapterName": h.xpath(chapterName),
            "chapterURL": h.xpath(chapterURL)
        }

    # Animation episode (video) page
    def animate_video(self, url):
        URL = 'http://www.yhdm.tv'
        if url is not None:
            URL += url
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        # title
        title = "//div[@class='gohome l']/h1/a/text()"
        url = "//div[@class='gohome l']/h1/a/@href"
        thisName = "//div[@class='gohome l']/h1/span/text()"
        pn = "//div[@class='fav r']/span/text()"
        pnName = "//div[@class='fav r']/a/text()"
        pnURL = "//div[@class='fav r']/a/@href"
        # player container; resolving it relies on the site's own JS
        player = "//div[@id='playbox']/@data-vid"
        # episode names
        chapterName = "//div[@class='movurls']/ul/li/a/text()"
        # episode URLs
        chapterURL = "//div[@class='movurls']/ul/li/a/@href"
        # episode currently playing
        this = "//div[@class='movurls']/ul/li[@class='sel']/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "url": h.xpath(url),
            "thisName": h.xpath(thisName),
            "pn": h.xpath(pn),
            "pnName": h.xpath(pnName),
            "pnURL": h.xpath(pnURL),
            # "player": etree.tostring(h.xpath(player)[0], encoding='utf-8').decode().replace('/>', '>'),
            "player": h.xpath(player)[0],
            "chapterName": h.xpath(chapterName),
            "chapterURL": h.xpath(chapterURL),
            "this": h.xpath(this)
        }

    # Could fetch the video URL directly; now deprecated
    def video(self, URL):
        i = 0
        while i < 6:
            try:
                self.count = self.count + 1
                self.browser.get(URL)
                h = etree.HTML(self.browser.page_source)
                i = 6
            except Exception:
                i += 1
        if self.count > 20:
            self.browser.quit()
            self.count = 0
            self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
            self.browser.set_page_load_timeout(3)
        return {
            'src': h.xpath('//video/@src')
        }

if __name__ == '__main__':
    kw = '进'
    name = 'haizeiwang'
    comic = "https://m.manhuadui.com/manhua/haizeiwang/296660.html"
    sp = Spider()

    # search = sp.comic_search(kw, '1')
    # print('comicsearch', search)
    # item = sp.comic_item(name)
    # print('comicitem', item)
    # img = sp.comic_img(comic, '1')
    # print('comicimg', img)
    #
    # table = sp.animate_table()
    # print('animatetable', table)
    # search = sp.animate_search(kw)
    # print('animatesearch', search)
    # nameUrl = '/show/4642.html'
    # item = sp.animate_item(nameUrl)
    # print('animateitem', item)
    videoUrl = '/v/4642-4.html'
    video = sp.animate_video(videoUrl)
    print('animatevideo', video)
    # videoUrl = 'http://tup.yhdm.tv/?vid=http://quan.qq.com/video/1098_da55dca635b47b0826e85e5996f9d65c$mp4&m=1'
    # video = sp.video(videoUrl)
    # print('animatevideo', video)
--------------------------------------------------------------------------------
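For reference, the search request that Spider.comic_search assembles by hand can also be issued standalone, letting requests build the query string. A sketch under that assumption; the endpoint, the parameter names ('keywords', 'page') and the response fields ('_meta', 'items') are the ones used in spider.py and spiderInDB.py.

# Sketch: the search request from Spider.comic_search, with requests encoding
# the query parameters instead of string concatenation.
import requests

s = requests.Session()
r = s.get('https://450.manhuadang.net/comic/search',
          params={'keywords': '进', 'page': '1'}, timeout=(3, 4))
data = r.json()
print(data['_meta']['pageCount'], len(data['items']))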
/spiderInDB.py:
--------------------------------------------------------------------------------
# This script stores the information of every comic on Manhuadui into a MySQL database.
# It can also be used for incremental updates after the initial crawl.
# The crawl is multithreaded, following: https://blog.csdn.net/gyt15663668337/article/details/86345690

# Database settings; the table structure is in comic_info.sql
host = "localhost"
user = "root"
password = ""
db = "coldrain"
port = 3306

import threading
import time
import queue
import pymysql
from spider import Spider

# Preset the number of result pages to 258 (roughly 258 * 20 comics)
total = 258
sql_insert = """insert into comic_info values ('%d',"%s","%s","%s","%s","%s","%s", NULL, NULL, NULL, NULL, NULL, NULL)"""
sql_select = """select cslug from comic_info where cid = '%d' and clastname != "%s" """
sql_update = """update comic_info set clastname = "%s", cserialise = '%d' where cid = '%d'"""
sql_update2 = """update comic_info set ctype = "%s", ccategory = "%s", carea = "%s", cupdate = "%s", cchapters = "%s", cchapterurl = "%s" where cslug = '%s'"""
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4"]

sp = Spider()
# Try to fetch the real total page count
try:
    total = sp.comic_search('', '1')['_meta']['pageCount'] + 1
except:
    print('Failed to fetch the total page count')

workQueue = queue.Queue(total * 21)
# Fill the queue with page numbers
for page in range(1, total):
    workQueue.put(page)
spiderUrls = []
threading.TIMEOUT_MAX = 10

# Worker thread
class myThread(threading.Thread):
    def __init__(self, name, q, flag):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q
        self.flag = flag
        # database connection
        self.db = pymysql.connect(host=host, user=user,
                                  password=password, db=db, port=port, charset='utf8')
        self.cur = self.db.cursor()

    def run(self):
        print("Starting " + self.name)
        if (self.flag == 1):
            while True:
                try: updateComicList(self.name, self.q, self.db, self.cur)
                except: break
        elif (self.flag == 2):
            while True:
                try: updateComic(self.name, self.q, self.db, self.cur)
                except: break
        self.cur.close()
        self.db.close()
        print("Exiting ", self.name)
# First crawl "https://450.manhuadang.net/comic/search" to get the comic list and the comics that need updating
def updateComicList(threadName, q, db, cur):
    # pop a page number off the queue
    page = q.get(timeout=2)
    url = "https://450.manhuadang.net/comic/search?page=" + str(page)
    try:
        items = sp.comic_search('', str(page))['items']
        for item in items:
            try:
                # Some cover URLs have a broken format and cannot be fetched as-is, so trim them first,
                # e.g. "https://res.333dm.com/http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg"
                coverUrl = item['coverUrl'][item['coverUrl'].rindex('http'):]
                # run the insert
                cur.execute(sql_insert % (item['id'], item['name'], item['slug'], coverUrl
                            , item['last_chapter_name'], item['author'], item['serialise']))
                db.commit()
                # remember item['slug'] for the detail-page crawl that follows
                spiderUrls.append(item['slug'])

            except Exception as ex:
                db.rollback()
                try:
                    # the insert failed (e.g. the comic is already in the table), so check whether the
                    # latest chapter stored in the database still matches the one just crawled
                    cur.execute(sql_select % (item['id'], item['last_chapter_name']))
                    results = cur.fetchall()
                    for row in results:
                        # if it does not match, update the latest chapter and queue the slug for re-crawling
                        print('Update found')
                        spiderUrls.append(row[0])
                        try:
                            cur.execute(sql_update % (item['last_chapter_name'], item['serialise'], item['id']))
                            db.commit()
                        except Exception as exc:
                            db.rollback()
                            print(exc)

                except Exception as ex:
                    print(ex)
        print(q.qsize(), threadName, url, len(spiderUrls))
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error', e)
        q.put(page)

# Crawl a comic detail page
def updateComic(threadName, q, db, cur):
    # pop a comic slug off the queue
    name = q.get(timeout=2)
    try:
        res = sp.comic_item(name)
        # type, chapterName and chapterURL come back as lists; join them with '|'
        type = ''
        for ty in res['type']:
            type += ty + '|'
        type = type[:-1]
        chapterName = ''
        for chapter in res['chapterName']:
            chapterName += chapter + '|'
        chapterName = chapterName[:-1]
        chapterUrl = ''
        for chapter in res['chapterURL']:
            chapterUrl += chapter + '|'
        chapterUrl = chapterUrl[:-1]
        try: category = res['category'][0]
        except: category = ''
        try: update = res['update'][0]
        except: update = ''
        try: area = res['area'][0]
        except: area = ''
        try:
            # run the update
            cur.execute(sql_update2 % (type, category, area, update, chapterName, chapterUrl, name))
            db.commit()
        except Exception as exc:
            db.rollback()
            print(exc)
        print(q.qsize(), threadName, name)
    except Exception as e:
        q.put(name)
        print(q.qsize(), threadName, name, 'Error', e)

start = time.time()
# Create the worker threads for the list crawl
threads = []
for tName in threadList:
    thread = myThread(tName, workQueue, 1)
    thread.start()
    threads.append(thread)

# Wait for all threads to finish
for t in threads:
    t.join()
end = time.time()
print("Total time to crawl the list:", end - start)


time.sleep(2)

start = time.time()
for name in spiderUrls:
    workQueue.put(name)

threads = []
for tName in threadList:
    thread = myThread(tName, workQueue, 2)
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()
end = time.time()
print("Total time to crawl the details:", end - start)
--------------------------------------------------------------------------------
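For clarity, the cover-URL cleanup in updateComicList keeps everything from the last occurrence of "http" onward; applied to the sample URL quoted in its comment it behaves like this.

# The cleanup used in updateComicList, applied to the sample URL from its comment.
cover = "https://res.333dm.com/http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg"
print(cover[cover.rindex('http'):])
# -> http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg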