├── README.md
├── api.py
├── comic_info.sql
├── spider.py
└── spiderInDB.py

/README.md:
--------------------------------------------------------------------------------
# ACSpider
A crawler API for anime and comics. Data sources are Manhuadui (漫画堆) and Yinghua Dongman (樱花动漫); built with Python 3.5 + requests + selenium + PhantomJS + Flask.

This is the crawler API I wrote for an anime/comic search site. (Now abandoned.)

7.29 Changed the way animation video addresses are fetched

7.28 Added the animation crawler; data comes from Yinghua Dongman (樱花动漫)


## What each file does

### spider.py
The crawler class. Fetches the required data from search pages, detail pages and chapter pages.

### api.py
The API layer. Exposes the crawler class as an HTTP API.

### spiderInDB.py
The storage script. Saves the usable data for every comic on Manhuadui into a MySQL database (everything except the images of each chapter, because including them would make the crawl take far too long).

### comic_info.sql
The table structure used by the storage script.

## Known issues
There are probably plenty of small problems; only the ones I consider significant are listed here.
### PhantomJS memory usage
PhantomJS uses more and more memory during continuous crawling, until the server falls over. Simulating closing and reopening tabs or windows, clearing cookies, and switching to headless Chrome did not help much, so in the end I settled on a crude workaround: restart PhantomJS after a certain number of pages. One problem remains: the restart blocks the data being returned. I think the restart code could be moved into a new thread, but I am not sure how to do that.
### selenium retries
(Resolved 7.21)
I do not know whether selenium has a built-in way to set a retry count, so I wrote a while loop in api.py to retry, but it did not seem to work; if selenium's wait time is set too short it simply throws an error.
--------------------------------------------------------------------------------
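The restart described under "PhantomJS memory usage" could plausibly be moved off the request path by building the replacement browser in a background thread and swapping it in once ready. A minimal sketch, not code from this repo: restart_browser_async and _restart_lock are names introduced here, and callers would still need to avoid touching the old instance mid-swap.

# Sketch only: one possible way to restart PhantomJS without blocking the response,
# reusing Spider, service_args and dcap from spider.py.
import threading
from selenium import webdriver
from spider import Spider, service_args, dcap

_restart_lock = threading.Lock()

def restart_browser_async(sp: Spider):
    """Build a fresh PhantomJS in a background thread, then swap it in."""
    def _restart():
        new = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
        new.set_page_load_timeout(3)
        with _restart_lock:
            old, sp.browser, sp.count = sp.browser, new, 0
        old.quit()  # quit the old instance only after the swap
    threading.Thread(target=_restart, daemon=True).start()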
/api.py:
--------------------------------------------------------------------------------
# This module exposes the crawler as an HTTP API

from flask import Flask, request
import flask_restful
from flask_restful import Resource
from spider import Spider
import logging

# Raise the log level to filter out access logs and keep memory usage down
log = logging.getLogger('werkzeug')
log.setLevel(logging.WARNING)

app = Flask("Spider")
api = flask_restful.Api(app)

sp = Spider()

# Comic search page
class ComicSearch(Resource):
    def get(self):
        kw = request.args.get('kw')
        p = request.args.get('p')
        try: res = sp.comic_search(kw, p)
        except: res = ''
        finally: return res

# Comic detail page
class ComicItem(Resource):
    def get(self):
        name = request.args.get('slug')
        try: res = sp.comic_item(name)
        except: res = ''
        finally: return res

# Comic chapter page
class ComicImg(Resource):
    def get(self):
        url = request.args.get('ch')
        p = request.args.get('p')
        try: res = sp.comic_img(url, p)
        except: res = ''
        finally: return res

# Animation schedule
class AnimateTable(Resource):
    def get(self):
        try: res = sp.animate_table()
        except: res = ''
        finally: return res

# Animation search page
class AnimateSearch(Resource):
    def get(self):
        kw = request.args.get('kw')
        try: res = sp.animate_search(kw)
        except: res = ''
        finally: return res

# Animation detail page
class AnimateItem(Resource):
    def get(self):
        url = request.args.get('url')
        try: res = sp.animate_item(url)
        except: res = ''
        finally: return res

# Animation episode page
class AnimateVideo(Resource):
    def get(self):
        url = request.args.get('url')
        try: res = sp.animate_video(url)
        except: res = ''
        finally: return res

# Deprecated
# class Video(Resource):
#     def get(self):
#         src = request.args.get('src')
#         return sp.video(src)

api.add_resource(ComicSearch, '/spider/comicsearch')
api.add_resource(ComicItem, '/spider/comicitem')
api.add_resource(ComicImg, '/spider/comicimg')
api.add_resource(AnimateTable, '/spider/animatetable')
api.add_resource(AnimateSearch, '/spider/animatesearch')
api.add_resource(AnimateItem, '/spider/animateitem')
api.add_resource(AnimateVideo, '/spider/animatevideo')
# api.add_resource(Video, '/spider/video')

if __name__ == '__main__':
    # Host is localhost, port 5000; use_reloader=False keeps the code from running twice
    # Example request: localhost:5000/spider/comicsearch?kw=进&p=1
    app.run(host='localhost', port=5000, debug=True, use_reloader=False)
--------------------------------------------------------------------------------
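With api.py running locally, the routes registered above can be exercised directly. A quick sketch using requests: the first query mirrors the example in the comment at the bottom of api.py, and 'haizeiwang' is the sample slug used in spider.py's own test block.

# Quick manual test of the API while api.py is running on localhost:5000.
import requests

BASE = 'http://localhost:5000'

# comic search: kw = keyword, p = page (20 results per page)
print(requests.get(BASE + '/spider/comicsearch', params={'kw': '进', 'p': '1'}).json())

# comic detail page: slug is the comic's romanized name
print(requests.get(BASE + '/spider/comicitem', params={'slug': 'haizeiwang'}).json())

# animation schedule
print(requests.get(BASE + '/spider/animatetable').json())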
/comic_info.sql:
--------------------------------------------------------------------------------
/*
Navicat MySQL Data Transfer

Source Server         : localhost_3306
Source Server Version : 50724
Source Host           : localhost:3306
Source Database       : coldrain

Target Server Type    : MYSQL
Target Server Version : 50724
File Encoding         : 65001

Date: 2019-07-20 10:48:20
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for comic_info
-- ----------------------------
DROP TABLE IF EXISTS `comic_info`;
CREATE TABLE `comic_info` (
  `cid` int(11) NOT NULL,
  `cname` varchar(255) DEFAULT NULL,
  `cslug` varchar(100) DEFAULT NULL,
  `ccover` varchar(255) DEFAULT NULL,
  `clastname` varchar(255) DEFAULT NULL,
  `cauthor` varchar(255) DEFAULT NULL,
  `cserialise` tinyint(1) DEFAULT NULL,
  `ctype` varchar(255) DEFAULT NULL,
  `ccategory` varchar(25) DEFAULT NULL,
  `carea` varchar(15) DEFAULT NULL,
  `cupdate` varchar(255) DEFAULT NULL,
  `cchapters` text,
  `cchapterurl` text,
  PRIMARY KEY (`cid`),
  KEY `cslug` (`cslug`),
  KEY `cid` (`cid`,`clastname`),
  KEY `cname` (`cname`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
--------------------------------------------------------------------------------
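spiderInDB.py fills this table by interpolating values straight into its SQL strings with %. For comparison, a minimal sketch of the same insert using pymysql parameter binding; the connection settings mirror spiderInDB.py, and the row values are placeholders made up for the example.

# Sketch: inserting one row into comic_info with parameter binding instead of
# building the SQL by string formatting (the values below are placeholders).
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', db='coldrain',
                       port=3306, charset='utf8')
with conn.cursor() as cur:
    cur.execute(
        "insert into comic_info (cid, cname, cslug, ccover, clastname, cauthor, cserialise) "
        "values (%s, %s, %s, %s, %s, %s, %s)",
        (1, 'Example Comic', 'haizeiwang', 'http://example.com/cover.jpg',
         'Chapter 1', 'Example Author', 1))
conn.commit()
conn.close()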
/spider.py:
--------------------------------------------------------------------------------
# coding = utf-8
# This module scrapes the mobile site of Manhuadui (漫画堆)

from lxml import etree
import requests
from requests.adapters import HTTPAdapter
import json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

service_args = []
service_args.append('--load-images=false')       # disable image loading
service_args.append('--ignore-ssl-errors=true')  # ignore HTTPS errors
service_args.append('--disk-cache=true')         # enable the disk cache
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
)

class Spider:
    def __init__(self):
        # page counter; PhantomJS is restarted once it passes a threshold
        self.count = 0
        self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
        # page-load timeout
        self.browser.set_page_load_timeout(3)
        self.s = requests.Session()
        self.s.headers.update({'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'})
        # retry count for requests
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))

    # Crawl search results
    # kw: keyword
    # p: page number, 20 results per page
    def comic_search(self, kw, p):
        URL = 'https://450.manhuadang.net/comic/search'
        if kw is not None:
            URL += '?keywords=' + kw
            if p is not None:
                URL += '&page=' + p
        elif p is not None:
            URL += '?page=' + p
        # timeout=(connect timeout, read timeout)
        r = self.s.get(URL, timeout=(3, 4))
        data = json.loads(r.text)
        r.close()
        # return the full JSON result
        return data

    # Crawl a comic detail page
    # name: the comic's slug (romanized name), see the actual page
    def comic_item(self, name):
        URL = 'https://m.manhuadui.com/manhua/'
        r = self.s.get(URL + name + "/", timeout=(3, 4))
        h = etree.HTML(r.text)
        # comic title
        title = "//div[@class='subHeader']/h1[@id='comicName']/text()"
        # cover URL
        cover = "//div[@id='Cover']/img/@src"
        # author
        author = "//div[@class='sub_r autoHeight']/p[1]/text()"
        # type
        type = "//div[@class='sub_r autoHeight']/p[2]/a/text()"
        # category
        category = "//div[@class='sub_r autoHeight']/p[3]/a[1]/text()"
        # region
        area = "//div[@class='sub_r autoHeight']/p[3]/a[2]/text()"
        # status
        status = "//div[@class='sub_r autoHeight']/p[3]/a[3]/text()"
        # last update date
        update = "//div[@class='sub_r autoHeight']/p[5]/span[2]/text()"
        # chapter names
        chapterName = "//div[@class='chapter-warp']/ul/li/a/span[1]/text()"
        # chapter URLs
        chapterURL = "//div[@class='chapter-warp']/ul/li/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "author": h.xpath(author),
            "category": h.xpath(category),
            "type": h.xpath(type),
            "area": h.xpath(area),
            "status": h.xpath(status),
            "update": h.xpath(update),
            "chapterName": h.xpath(chapterName)[::-1],
            "chapterURL": h.xpath(chapterURL)[::-1]
        }

    # Crawl the actual content of a comic chapter
    # URL: the chapter's address on the mobile site
    # p: page number
    def comic_img(self, URL, p):
        if p is not None:
            URL += '?p=' + p
        i = 0
        # retry up to six times; note that if every attempt fails, h and
        # thisChapter stay unset and the code below raises
        while i < 6:
            try:
                # count this fetch
                self.count = self.count + 1
                self.browser.get(URL)
                h = etree.HTML(self.browser.page_source)
                # XPath for the current chapter name
                this = "//div[@class='subHeader']/a[@class='BarTit']/text()"
                # current chapter name
                thisChapter = h.xpath(this)[0].replace('\n', '').strip()
                i = 6
            except Exception:
                i += 1
        # data for the previous chapter
        prev = self.browser.execute_script('return prevChapterData')
        # data for the next chapter
        next = self.browser.execute_script('return nextChapterData')
        cover = self.browser.execute_script('return pageImage')
        # the comic's slug
        slug = "//a[@class='iconRet']/@href"
        # comic title + chapter name
        title = "//head/meta[@name='keywords']/@content"
        # URLs of the comic images on the current page
        img = "//div[@id='images']/img/@src"
        # current page number and total pages
        page = "//div[@id='images']/p/text()"
        # comic title
        titleName = h.xpath(title)[0].replace(thisChapter, '')
        # PhantomJS keeps using more memory the longer it crawls, hence this crude workaround.
        # Ideally the restart would run in a separate thread; doing it inline adds a lot of
        # waiting, but I am not sure how to do that.
        # Once the counter passes 20, restart PhantomJS.
        # self.browser.delete_all_cookies()
        if self.count > 20:
            self.browser.quit()
            self.count = 0
            self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
            self.browser.set_page_load_timeout(3)
        return {
            'prev': prev,
            'next': next,
            'cover': cover,
            "title": titleName,
            "this": thisChapter,
            "img": h.xpath(img),
            "page": h.xpath(page),
            "slug": h.xpath(slug)[0].replace('https://m.manhuadui.com/manhua/', '')[:-1]
        }

    # Animation schedule
    def animate_table(self):
        URL = 'http://m.yhdm.tv'
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        titles = []
        urls = []
        news = []
        newUrls = []
        for index in range(7):
            title = "//div[@class='tlist']/ul[%d]/li/a/text()"
            url = "//div[@class='tlist']/ul[%d]/li/a/@href"
            new = "//div[@class='tlist']/ul[%d]/li/span/a/text()"
            newUrl = "//div[@class='tlist']/ul[%d]/li/span/a/@href"
            titles.append(h.xpath(title % (index + 1)))
            urls.append(h.xpath(url % (index + 1)))
            news.append(h.xpath(new % (index + 1)))
            newUrls.append(h.xpath(newUrl % (index + 1)))
        r.close()
        return {
            "title": titles,
            "url": urls,
            "new": news,
            "newUrl": newUrls
        }

    # Animation search
    def animate_search(self, kw):
        URL = 'http://m.yhdm.tv/search/'
        if kw is not None:
            URL += kw
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text)
        # title
        title = "//a[@class='itemtext']/text()"
        # cover URL
        cover = "//div[@class='imgblock']/@style"
        # latest episode
        new = "//div[@class='itemimgtext']/text()"
        # URL
        url = "//a[@class='itemtext']/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "new": h.xpath(new),
            "url": h.xpath(url)
        }

    # Animation detail page
    def animate_item(self, url):
        URL = 'http://m.yhdm.tv'
        if url is not None:
            URL += url
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        # title
        title = "//div[@class='show']/h1/text()"
        # cover URL
        cover = "//div[@class='show']/img/@src"
        # latest episode
        new = "//div[@class='show']/p[2]/text()"
        # air date
        time = "//div[@class='show']/p[3]/text()"
        # type
        type = "//div[@class='show']/p[4]/a/text()"
        # description
        info = "//div[@class='info']/text()"
        # episode names
        chapterName = "//div[@id='playlists']/ul/li/a/text()"
        # episode URLs
        chapterURL = "//div[@id='playlists']/ul/li/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "cover": h.xpath(cover),
            "new": h.xpath(new),
            "time": h.xpath(time),
            "type": h.xpath(type),
            "info": h.xpath(info),
            "chapterName": h.xpath(chapterName),
            "chapterURL": h.xpath(chapterURL)
        }

    # Animation episode (video) page
    def animate_video(self, url):
        URL = 'http://www.yhdm.tv'
        if url is not None:
            URL += url
        r = self.s.get(URL, timeout=(3, 4))
        h = etree.HTML(r.text.encode('ISO-8859-1'))
        # title
        title = "//div[@class='gohome l']/h1/a/text()"
        url = "//div[@class='gohome l']/h1/a/@href"
        thisName = "//div[@class='gohome l']/h1/span/text()"
        pn = "//div[@class='fav r']/span/text()"
        pnName = "//div[@class='fav r']/a/text()"
        pnURL = "//div[@class='fav r']/a/@href"
        # player container; resolving it relies on the site's own JS
        player = "//div[@id='playbox']/@data-vid"
        # episode names
        chapterName = "//div[@class='movurls']/ul/li/a/text()"
        # episode URLs
        chapterURL = "//div[@class='movurls']/ul/li/a/@href"
        # episode currently playing
        this = "//div[@class='movurls']/ul/li[@class='sel']/a/@href"
        r.close()
        return {
            "title": h.xpath(title),
            "url": h.xpath(url),
            "thisName": h.xpath(thisName),
            "pn": h.xpath(pn),
            "pnName": h.xpath(pnName),
            "pnURL": h.xpath(pnURL),
            # "player": etree.tostring(h.xpath(player)[0], encoding='utf-8').decode().replace('/>', '>'),
            "player": h.xpath(player)[0],
            "chapterName": h.xpath(chapterName),
            "chapterURL": h.xpath(chapterURL),
            "this": h.xpath(this)
        }

    # Could fetch the video URL directly; now deprecated
    def video(self, URL):
        i = 0
        while i < 6:
            try:
                self.count = self.count + 1
                self.browser.get(URL)
                h = etree.HTML(self.browser.page_source)
                i = 6
            except Exception:
                i += 1
        if self.count > 20:
            self.browser.quit()
            self.count = 0
            self.browser = webdriver.PhantomJS(service_args=service_args, desired_capabilities=dcap)
            self.browser.set_page_load_timeout(3)
        return {
            'src': h.xpath('//video/@src')
        }

if __name__ == '__main__':
    kw = '进'
    name = 'haizeiwang'
    comic = "https://m.manhuadui.com/manhua/haizeiwang/296660.html"
    sp = Spider()

    # search = sp.comic_search(kw, '1')
    # print('comicsearch', search)
    # item = sp.comic_item(name)
    # print('comicitem', item)
    # img = sp.comic_img(comic, '1')
    # print('comicimg', img)
    #
    # table = sp.animate_table()
    # print('animatetable', table)
    # search = sp.animate_search(kw)
    # print('animatesearch', search)
    # nameUrl = '/show/4642.html'
    # item = sp.animate_item(nameUrl)
    # print('animateitem', item)
    videoUrl = '/v/4642-4.html'
    video = sp.animate_video(videoUrl)
    print('animatevideo', video)
    # videoUrl = 'http://tup.yhdm.tv/?vid=http://quan.qq.com/video/1098_da55dca635b47b0826e85e5996f9d65c$mp4&m=1'
    # video = sp.video(videoUrl)
    # print('animatevideo', video)
--------------------------------------------------------------------------------
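For reference, the search request that Spider.comic_search assembles by hand can also be issued standalone, letting requests build the query string. A sketch under that assumption; the endpoint, the parameter names ('keywords', 'page') and the response fields ('_meta', 'items') are the ones used in spider.py and spiderInDB.py.

# Sketch: the search request from Spider.comic_search, with requests encoding
# the query parameters instead of string concatenation.
import requests

s = requests.Session()
r = s.get('https://450.manhuadang.net/comic/search',
          params={'keywords': '进', 'page': '1'}, timeout=(3, 4))
data = r.json()
print(data['_meta']['pageCount'], len(data['items']))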
/spiderInDB.py:
--------------------------------------------------------------------------------
# This script stores the information of every comic on Manhuadui into a MySQL database.
# It can also be used for incremental updates after the initial crawl.
# The crawl is multithreaded, following: https://blog.csdn.net/gyt15663668337/article/details/86345690

# Database settings; the table structure is in comic_info.sql
host = "localhost"
user = "root"
password = ""
db = "coldrain"
port = 3306

import threading
import time
import queue
import pymysql
from spider import Spider

# Preset the number of result pages to 258 (roughly 258 * 20 comics)
total = 258
sql_insert = """insert into comic_info values ('%d',"%s","%s","%s","%s","%s","%s", NULL, NULL, NULL, NULL, NULL, NULL)"""
sql_select = """select cslug from comic_info where cid = '%d' and clastname != "%s" """
sql_update = """update comic_info set clastname = "%s", cserialise = '%d' where cid = '%d'"""
sql_update2 = """update comic_info set ctype = "%s", ccategory = "%s", carea = "%s", cupdate = "%s", cchapters = "%s", cchapterurl = "%s" where cslug = '%s'"""
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4"]

sp = Spider()
# Try to fetch the real total page count
try:
    total = sp.comic_search('', '1')['_meta']['pageCount'] + 1
except:
    print('Failed to fetch the total page count')

workQueue = queue.Queue(total * 21)
# Fill the queue with page numbers
for page in range(1, total):
    workQueue.put(page)
spiderUrls = []
threading.TIMEOUT_MAX = 10

# Worker thread
class myThread(threading.Thread):
    def __init__(self, name, q, flag):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q
        self.flag = flag
        # database connection
        self.db = pymysql.connect(host=host, user=user,
                                  password=password, db=db, port=port, charset='utf8')
        self.cur = self.db.cursor()

    def run(self):
        print("Starting " + self.name)
        if (self.flag == 1):
            while True:
                try: updateComicList(self.name, self.q, self.db, self.cur)
                except: break
        elif (self.flag == 2):
            while True:
                try: updateComic(self.name, self.q, self.db, self.cur)
                except: break
        self.cur.close()
        self.db.close()
        print("Exiting ", self.name)
# First crawl "https://450.manhuadang.net/comic/search" to get the comic list and the comics that need updating
def updateComicList(threadName, q, db, cur):
    # pop a page number off the queue
    page = q.get(timeout=2)
    url = "https://450.manhuadang.net/comic/search?page=" + str(page)
    try:
        items = sp.comic_search('', str(page))['items']
        for item in items:
            try:
                # Some cover URLs have a broken format and cannot be fetched as-is, so trim them first,
                # e.g. "https://res.333dm.com/http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg"
                coverUrl = item['coverUrl'][item['coverUrl'].rindex('http'):]
                # run the insert
                cur.execute(sql_insert % (item['id'], item['name'], item['slug'], coverUrl
                            , item['last_chapter_name'], item['author'], item['serialise']))
                db.commit()
                # remember item['slug'] for the detail-page crawl that follows
                spiderUrls.append(item['slug'])

            except Exception as ex:
                db.rollback()
                try:
                    # the insert failed (e.g. the comic is already in the table), so check whether the
                    # latest chapter stored in the database still matches the one just crawled
                    cur.execute(sql_select % (item['id'], item['last_chapter_name']))
                    results = cur.fetchall()
                    for row in results:
                        # if it does not match, update the latest chapter and queue the slug for re-crawling
                        print('Update found')
                        spiderUrls.append(row[0])
                        try:
                            cur.execute(sql_update % (item['last_chapter_name'], item['serialise'], item['id']))
                            db.commit()
                        except Exception as exc:
                            db.rollback()
                            print(exc)

                except Exception as ex:
                    print(ex)
        print(q.qsize(), threadName, url, len(spiderUrls))
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error', e)
        q.put(page)

# Crawl a comic detail page
def updateComic(threadName, q, db, cur):
    # pop a comic slug off the queue
    name = q.get(timeout=2)
    try:
        res = sp.comic_item(name)
        # type, chapterName and chapterURL come back as lists; join them with '|'
        type = ''
        for ty in res['type']:
            type += ty + '|'
        type = type[:-1]
        chapterName = ''
        for chapter in res['chapterName']:
            chapterName += chapter + '|'
        chapterName = chapterName[:-1]
        chapterUrl = ''
        for chapter in res['chapterURL']:
            chapterUrl += chapter + '|'
        chapterUrl = chapterUrl[:-1]
        try: category = res['category'][0]
        except: category = ''
        try: update = res['update'][0]
        except: update = ''
        try: area = res['area'][0]
        except: area = ''
        try:
            # run the update
            cur.execute(sql_update2 % (type, category, area, update, chapterName, chapterUrl, name))
            db.commit()
        except Exception as exc:
            db.rollback()
            print(exc)
        print(q.qsize(), threadName, name)
    except Exception as e:
        q.put(name)
        print(q.qsize(), threadName, name, 'Error', e)

start = time.time()
# Create the worker threads for the list crawl
threads = []
for tName in threadList:
    thread = myThread(tName, workQueue, 1)
    thread.start()
    threads.append(thread)

# Wait for all threads to finish
for t in threads:
    t.join()
end = time.time()
print("Total time to crawl the list:", end - start)


time.sleep(2)

start = time.time()
for name in spiderUrls:
    workQueue.put(name)

threads = []
for tName in threadList:
    thread = myThread(tName, workQueue, 2)
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()
end = time.time()
print("Total time to crawl the details:", end - start)
--------------------------------------------------------------------------------
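For clarity, the cover-URL cleanup in updateComicList keeps everything from the last occurrence of "http" onward; applied to the sample URL quoted in its comment it behaves like this.

# The cleanup used in updateComicList, applied to the sample URL from its comment.
cover = "https://res.333dm.com/http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg"
print(cover[cover.rindex('http'):])
# -> http://mh.manhuazj.com/Uploads/vod/2019-04-02/5ca33ad82f91c.jpg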